diff --git a/.amlignore b/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/.amlignore.amltmp b/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/.github/actions/aml-endpoint-deploy/action.yaml b/.github/actions/aml-endpoint-deploy/action.yaml index f37c4b43..24c03b60 100644 --- a/.github/actions/aml-endpoint-deploy/action.yaml +++ b/.github/actions/aml-endpoint-deploy/action.yaml @@ -1,126 +1,137 @@ -name: Deploy AzureML managed online endpoint -description: 'Deploys a model endpoint in Azure Machine Learning Services all along with all the deployments it contains. Logs are collected and uploaded.' - -inputs: - resourceGroup: - description: 'Name of the resource group where the workspace is placed.' - required: true - workspaceName: - description: 'Name of the workspace to work against.' - required: true - endpointFile: - description: 'Path to the endpoint YAML file.' - required: true - deploymentFile: - description: 'Path to the deployment YAML file for the given endpoint.' - required: true - modelVersion: - description: 'Model version you want to deploy. Supports either a specific version number, or "latest". If not specified, using the deployment file model version.' - required: false - default: '' - updateIfExists: - description: 'If endpoint exists, update it instead of creating a new one.' 
- required: false - default: 'false' -outputs: - deployedVersion: - description: 'Deployed version of the model' - value: ${{ steps.deployment.outputs.deployedVersion }} - -runs: - using: "composite" - steps: - - name: Deploy endpoint - id: deployment - shell: bash - run: | - set -e - az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} - - ENDPOINT_FILE=${{ inputs.endpointFile }} - DEPLOYMENT_FILE=${{ inputs.deploymentFile }} - - ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) - echo "Endpoint name: $ENDPOINT_NAME" - - # Removing traffic if present in endpoint config as we'll manage traffic setup as part of the safe rollout - echo "Rewriting endpoint file without traffic" - yq -y -i "del(.traffic)" $ENDPOINT_FILE - - # Create or update endpoint - { - echo "Creating endpoint with name: $ENDPOINT_NAME" && - az ml online-endpoint create -f $ENDPOINT_FILE - } || { - echo "Endpoint $ENDPOINT_NAME already exists" - if [ ${{ inputs.updateIfExists }} == 'true' ]; then - echo "Updating endpoint with name: $ENDPOINT_NAME" && - az ml online-endpoint update -f $ENDPOINT_FILE - else - echo "Skipping update of endpoint with name: $ENDPOINT_NAME" - fi - } - - # Identify which slot should be used to stage this deployment based on current traffic - echo "Reading endpoint traffic to identify target staging deployment slot" - az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > _endpoint_traffic.yml - echo "Endpoint traffic:" - cat _endpoint_traffic.yml - GREEN_TRAFFIC=$(yq .green _endpoint_traffic.yml) - BLUE_TRAFFIC=$(yq .blue _endpoint_traffic.yml) - if [[ $GREEN_TRAFFIC == null || $GREEN_TRAFFIC == 0 ]]; then - STAGING_DEPLOYMENT_NAME='green'; - else - if [[ $BLUE_TRAFFIC == null || $BLUE_TRAFFIC == 0 ]]; then - STAGING_DEPLOYMENT_NAME='blue'; - else - echo "::error::No staging slots available for endpoint $ENDPOINT_NAME. 
One of the green/blue slots needs to have 0% traffic."; - exit 1; - fi - fi - echo "Selected staging deployment name: $STAGING_DEPLOYMENT_NAME" - - # Updating deployment file to setup name of deployment based on staging name selected above - echo "Updating deployment name to $STAGING_DEPLOYMENT_NAME" - if [[ $STAGING_DEPLOYMENT_NAME == "blue" ]]; then - yq -y -i '.name= "blue"' $DEPLOYMENT_FILE; - else - yq -y -i '.name= "green"' $DEPLOYMENT_FILE; - fi - - # Overwrite the model version set in the deployment file with a specific version or 'latest' if specified in the workflow - DEPLOYMENT_MODEL=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f2) - DEPLOYMENT_MODEL_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) - if [ -z "${{ inputs.modelVersion}}" ]; then - TARGET_MODEL_VERSION=$DEPLOYMENT_MODEL_VERSION - else - echo "Model being targeted is being overwriten with version ${{ inputs.modelVersion}}" - TARGET_MODEL_VERSION=${{ inputs.modelVersion}} - fi - if [[ "$TARGET_MODEL_VERSION" == "latest" ]]; then - echo "Identifying latest version of the model $DEPLOYMENT_MODEL" - TARGET_MODEL_VERSION=$(az ml model list --name $DEPLOYMENT_MODEL | jq -r '.[0].version') - echo "Latest version of model $DEPLOYMENT_MODEL is $TARGET_MODEL_VERSION" - fi - if [[ $TARGET_MODEL_VERSION != $DEPLOYMENT_MODEL_VERSION ]]; then - echo "Updating deployment file with model version: $TARGET_MODEL_VERSION" - sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$TARGET_MODEL_VERSION'/' $DEPLOYMENT_FILE - fi - echo "::set-output name=deployedVersion::$TARGET_MODEL_VERSION" - - # Create deployment - echo "Creating deployment with name: $ENDPOINT_NAME/$STAGING_DEPLOYMENT_NAME" - az ml online-deployment create -f $DEPLOYMENT_FILE --only-show-errors --set tags.git_commit=${GITHUB_SHA} - echo "Deployment completed" - - # Saving logs - echo "Acquiring logs for deployment with name: $ENDPOINT_NAME/$STAGING_DEPLOYMENT_NAME" - mkdir -p logs - az ml online-deployment get-logs --name $STAGING_DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME >> logs/$ENDPOINT_NAME_$STAGING_DEPLOYMENT_NAME.log - - - name: Upload deployment logs - uses: actions/upload-artifact@v2 - if: ${{ (failure() || success()) }} - with: - name: deployment-logs +name: Deploy AzureML managed online endpoint +description: 'Deploys a model endpoint in Azure Machine Learning Services all along with all the deployments it contains. Logs are collected and uploaded.' + +inputs: + resourceGroup: + description: 'Name of the resource group where the workspace is placed.' + required: true + workspaceName: + description: 'Name of the workspace to work against.' + required: true + endpointFile: + description: 'Path to the endpoint YAML file.' + required: true + deploymentFile: + description: 'Path to the deployment YAML file for the given endpoint.' + required: true + modelVersion: + description: 'Model version you want to deploy. Supports either a specific version number, or "latest". If not specified, using the deployment file model version.' + required: false + default: '' + updateIfExists: + description: 'If endpoint exists, update it instead of creating a new one.' 
+ required: false + default: 'false' +outputs: + deployedVersion: + description: 'Deployed version of the model' + value: ${{ steps.deployment.outputs.deployedVersion }} + +runs: + using: "composite" + steps: + - name: Deploy endpoint + id: deployment + shell: bash + run: | + set -e + az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} + + ENDPOINT_FILE=${{ inputs.endpointFile }} + DEPLOYMENT_FILE=${{ inputs.deploymentFile }} + + ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) + echo "Endpoint name: $ENDPOINT_NAME" + + # Removing traffic if present in endpoint config as we'll manage traffic setup as part of the safe rollout + echo "Rewriting endpoint file without traffic" + yq -y -i "del(.traffic)" $ENDPOINT_FILE + + # Create or update endpoint + { + echo "Creating endpoint with name: $ENDPOINT_NAME" && + az ml online-endpoint create -f $ENDPOINT_FILE + } || { + echo "Endpoint $ENDPOINT_NAME already exists" + if [ ${{ inputs.updateIfExists }} == 'true' ]; then + echo "Updating endpoint with name: $ENDPOINT_NAME" && + az ml online-endpoint update -f $ENDPOINT_FILE + else + echo "Skipping update of endpoint with name: $ENDPOINT_NAME" + fi + } + + # Identify which slot should be used to stage this deployment based on current traffic + echo "Reading endpoint traffic to identify target staging deployment slot" + az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > _endpoint_traffic.yml + echo "Endpoint traffic:" + cat _endpoint_traffic.yml + GREEN_TRAFFIC=$(yq .green _endpoint_traffic.yml) + BLUE_TRAFFIC=$(yq .blue _endpoint_traffic.yml) + if [[ $GREEN_TRAFFIC == null || $GREEN_TRAFFIC == 0 ]]; then + STAGING_DEPLOYMENT_NAME='green'; + else + if [[ $BLUE_TRAFFIC == null || $BLUE_TRAFFIC == 0 ]]; then + STAGING_DEPLOYMENT_NAME='blue'; + else + echo "::error::No staging slots available for endpoint $ENDPOINT_NAME. 
One of the green/blue slots needs to have 0% traffic."; + exit 1; + fi + fi + echo "Selected staging deployment name: $STAGING_DEPLOYMENT_NAME" + + # Updating deployment file to setup name of deployment based on staging name selected above + echo "Updating deployment name to $STAGING_DEPLOYMENT_NAME" + if [[ $STAGING_DEPLOYMENT_NAME == "blue" ]]; then + yq -y -i '.name= "blue"' $DEPLOYMENT_FILE; + else + yq -y -i '.name= "green"' $DEPLOYMENT_FILE; + fi + + # Overwrite the model version set in the deployment file with a specific version or 'latest' if specified in the workflow + DEPLOYMENT_MODEL=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f2) + DEPLOYMENT_MODEL_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) + if [ -z "${{ inputs.modelVersion}}" ]; then + TARGET_MODEL_VERSION=$DEPLOYMENT_MODEL_VERSION + else + echo "Model being targeted is being overwriten with version ${{ inputs.modelVersion}}" + TARGET_MODEL_VERSION=${{ inputs.modelVersion}} + fi + if [[ "$TARGET_MODEL_VERSION" == "latest" ]]; then + echo "Identifying latest version of the model $DEPLOYMENT_MODEL" + TARGET_MODEL_VERSION=$(az ml model list --name $DEPLOYMENT_MODEL | jq -r '.[0].version') + echo "Latest version of model $DEPLOYMENT_MODEL is $TARGET_MODEL_VERSION" + fi + if [[ $TARGET_MODEL_VERSION != $DEPLOYMENT_MODEL_VERSION ]]; then + echo "Updating deployment file with model version: $TARGET_MODEL_VERSION" + sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$TARGET_MODEL_VERSION'/' $DEPLOYMENT_FILE + fi + echo "::set-output name=deployedVersion::$TARGET_MODEL_VERSION" + + # Create deployment + echo "Creating deployment with name: $ENDPOINT_NAME/$STAGING_DEPLOYMENT_NAME" + az ml online-deployment create -f $DEPLOYMENT_FILE --only-show-errors --set tags.git_commit=${GITHUB_SHA} + echo "Deployment completed" + + # Get the deployment list and store it in a variable + #deployment_list=$(az ml online-deployment list -e $ENDPOINT_NAME) + + # Check if the deployment list has items using jq + #if [[ $(echo "$deployment_list" | jq 'length > 0') == "true" ]]; then + # echo "Deployment $ENDPOINT_NAME already exists" + #else + # az ml online-deployment create -f $DEPLOYMENT_FILE --only-show-errors --set tags.git_commit=${GITHUB_SHA} + # echo "Deployment completed" + #fi + + # Saving logs + echo "Acquiring logs for deployment with name: $ENDPOINT_NAME/$STAGING_DEPLOYMENT_NAME" + mkdir -p logs + az ml online-deployment get-logs --name $STAGING_DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME >> logs/$ENDPOINT_NAME_$STAGING_DEPLOYMENT_NAME.log + + - name: Upload deployment logs + uses: actions/upload-artifact@v4 + if: ${{ (failure() || success()) }} + with: + name: deployment-logs path: logs/* \ No newline at end of file diff --git a/.github/actions/aml-endpoint-swap/action.yaml b/.github/actions/aml-endpoint-swap/action.yaml index b3c948e2..0a9546c8 100644 --- a/.github/actions/aml-endpoint-swap/action.yaml +++ b/.github/actions/aml-endpoint-swap/action.yaml @@ -1,58 +1,58 @@ -name: Swap AzureML managed online endpoint deployments -description: 'Swaps green/blue deployments of an Azure ML endpoint by switching traffic around between endpoint deployments.' - -inputs: - resourceGroup: - description: 'Name of the resource group where the workspace is placed.' - required: true - workspaceName: - description: 'Name of the workspace to work against.' - required: true - endpointFile: - description: 'Path to the endpoint YAML file. Wildcard paths are supported which means that all matched endpoints will be deployed.' 
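A side note on the `aml-endpoint-deploy/action.yaml` script shown above, as a sketch rather than part of the diff: in the log-collection line, the unbraced expansion `$ENDPOINT_NAME_` is read by bash as a single (unset) variable name, so the endpoint name is silently dropped from the log filename; and the `::set-output` workflow command used for `deployedVersion` has since been deprecated by GitHub in favour of writing to the `$GITHUB_OUTPUT` file. A minimal bash sketch of both adjustments, assuming the same variable and output names as the script:

```bash
# Braces keep ENDPOINT_NAME and STAGING_DEPLOYMENT_NAME as separate variables in the path
az ml online-deployment get-logs \
  --name "$STAGING_DEPLOYMENT_NAME" \
  --endpoint-name "$ENDPOINT_NAME" \
  >> "logs/${ENDPOINT_NAME}_${STAGING_DEPLOYMENT_NAME}.log"

# Newer replacement for the deprecated ::set-output workflow command
echo "deployedVersion=$TARGET_MODEL_VERSION" >> "$GITHUB_OUTPUT"
```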
- required: true - -runs: - using: "composite" - steps: - - name: Swap endpoint deployments - id: swap-deployments - shell: bash - run: | - set -e - az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} - - ENDPOINT_FILE=${{ inputs.endpointFile }} - ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) - echo "ENDPOINT_FILE: $ENDPOINT_FILE" - echo "ENDPOINT_NAME: $ENDPOINT_NAME" - - echo "Reading endpoint traffic to figure out which deployment is staging/production" - az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > endpoint_traffic.yml - echo "Endpoint traffic:" - cat endpoint_traffic.yml - GREEN_TRAFFIC=$(yq .green endpoint_traffic.yml) - BLUE_TRAFFIC=$(yq .blue endpoint_traffic.yml) - - if [ $GREEN_TRAFFIC == null ]; then - if [ $BLUE_TRAFFIC == null ]; then - echo "::error::No deployment slots available for endpoint $ENDPOINT_NAME. Nothing to swap."; - exit 1; - else - echo "Setting blue traffic to 100%" - az ml online-endpoint update -n $ENDPOINT_NAME --traffic "blue=100" - fi - else - if [ $BLUE_TRAFFIC == null ]; then - echo "Setting green traffic to 100%" - az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=100" - else - if [ $GREEN_TRAFFIC == 0 ]; then - echo "Setting traffic to: green=100 blue=0" - az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=100 blue=0" - else - echo "Setting traffic to: green=0 blue=100" - az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=0 blue=100" - fi - fi +name: Swap AzureML managed online endpoint deployments +description: 'Swaps green/blue deployments of an Azure ML endpoint by switching traffic around between endpoint deployments.' + +inputs: + resourceGroup: + description: 'Name of the resource group where the workspace is placed.' + required: true + workspaceName: + description: 'Name of the workspace to work against.' + required: true + endpointFile: + description: 'Path to the endpoint YAML file. Wildcard paths are supported which means that all matched endpoints will be deployed.' + required: true + +runs: + using: "composite" + steps: + - name: Swap endpoint deployments + id: swap-deployments + shell: bash + run: | + set -e + az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} + + ENDPOINT_FILE=${{ inputs.endpointFile }} + ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) + echo "ENDPOINT_FILE: $ENDPOINT_FILE" + echo "ENDPOINT_NAME: $ENDPOINT_NAME" + + echo "Reading endpoint traffic to figure out which deployment is staging/production" + az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > endpoint_traffic.yml + echo "Endpoint traffic:" + cat endpoint_traffic.yml + GREEN_TRAFFIC=$(yq .green endpoint_traffic.yml) + BLUE_TRAFFIC=$(yq .blue endpoint_traffic.yml) + + if [ $GREEN_TRAFFIC == null ]; then + if [ $BLUE_TRAFFIC == null ]; then + echo "::error::No deployment slots available for endpoint $ENDPOINT_NAME. 
Nothing to swap."; + exit 1; + else + echo "Setting blue traffic to 100%" + az ml online-endpoint update -n $ENDPOINT_NAME --traffic "blue=100" + fi + else + if [ $BLUE_TRAFFIC == null ]; then + echo "Setting green traffic to 100%" + az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=100" + else + if [ $GREEN_TRAFFIC == 0 ]; then + echo "Setting traffic to: green=100 blue=0" + az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=100 blue=0" + else + echo "Setting traffic to: green=0 blue=100" + az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=0 blue=100" + fi + fi fi \ No newline at end of file diff --git a/.github/actions/aml-endpoint-test/action.yaml b/.github/actions/aml-endpoint-test/action.yaml index 82bd0859..1b9433c3 100644 --- a/.github/actions/aml-endpoint-test/action.yaml +++ b/.github/actions/aml-endpoint-test/action.yaml @@ -1,47 +1,48 @@ -name: Test AzureML managed online endpoint deployment (0% traffic deployment) -description: 'Finds 0% traffic deployment of an Azure ML endpoint and tests it.' - -inputs: - resourceGroup: - description: 'Name of the resource group where the workspace is placed.' - required: true - workspaceName: - description: 'Name of the workspace to work against.' - required: true - endpointFile: - description: 'Path to the endpoint YAML file. Wildcard paths are supported which means that all matched endpoints will be deployed.' - required: true - requestFile: - description: 'Name of the json test request file.' - required: true - -runs: - using: "composite" - steps: - - name: Test endpoint deployments - id: test-deployment - shell: bash - run: | - set -e - az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} - - ENDPOINT_FILE=${{ inputs.endpointFile }} - ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) - echo "ENDPOINT_FILE: $ENDPOINT_FILE" - echo "ENDPOINT_NAME: $ENDPOINT_NAME" - - echo "Reading endpoint traffic to figure out which deployment is staging/production" - az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > endpoint_traffic.yml - echo "Endpoint traffic:" - cat endpoint_traffic.yml - GREEN_TRAFFIC=$(yq .green endpoint_traffic.yml) - BLUE_TRAFFIC=$(yq .blue endpoint_traffic.yml) - if [ $GREEN_TRAFFIC == 0 ]; then - TEST_DEPLOYMENT_NAME='green' - fi - if [ $BLUE_TRAFFIC == 0 ]; then - TEST_DEPLOYMENT_NAME='blue' - fi - - TEST_RESPONSE=$(az ml online-endpoint invoke -n $ENDPOINT_NAME --deployment $TEST_DEPLOYMENT_NAME --request-file ${{ inputs.requestFile }}) +name: Test AzureML managed online endpoint deployment (0% traffic deployment) +description: 'Finds 0% traffic deployment of an Azure ML endpoint and tests it.' + +inputs: + resourceGroup: + description: 'Name of the resource group where the workspace is placed.' + required: true + workspaceName: + description: 'Name of the workspace to work against.' + required: true + endpointFile: + description: 'Path to the endpoint YAML file. Wildcard paths are supported which means that all matched endpoints will be deployed.' + required: true + requestFile: + description: 'Name of the json test request file.' 
+ required: true + +runs: + using: "composite" + steps: + - name: Test endpoint deployments + id: test-deployment + shell: bash + run: | + set -e + az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} + + ENDPOINT_FILE=${{ inputs.endpointFile }} + ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) + echo "ENDPOINT_FILE: $ENDPOINT_FILE" + echo "ENDPOINT_NAME: $ENDPOINT_NAME" + + echo "Reading endpoint traffic to figure out which deployment is staging/production" + az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > endpoint_traffic.yml + echo "Endpoint traffic:" + cat endpoint_traffic.yml + GREEN_TRAFFIC=$(yq .green endpoint_traffic.yml) + BLUE_TRAFFIC=$(yq .blue endpoint_traffic.yml) + if [ $GREEN_TRAFFIC == 0 ]; then + TEST_DEPLOYMENT_NAME='green' + fi + if [ $BLUE_TRAFFIC == 0 ]; then + TEST_DEPLOYMENT_NAME='blue' + fi + + TEST_RESPONSE=$(az ml online-endpoint invoke -n $ENDPOINT_NAME --deployment $TEST_DEPLOYMENT_NAME --request-file ${{ inputs.requestFile }}) + echo "TEST_RESPONSE: $TEST_RESPONSE" # TODO: test that response is valid, fail with exit 1 if not \ No newline at end of file diff --git a/.github/actions/aml-job-create/action.yaml b/.github/actions/aml-job-create/action.yaml index e3e0c019..1a604ac2 100644 --- a/.github/actions/aml-job-create/action.yaml +++ b/.github/actions/aml-job-create/action.yaml @@ -1,58 +1,59 @@ -name: Submitting job -description: 'Creates and submit a new job to Azure ML based on a job configuration. Jobs are named using the provided job name and a unique run id returned by GitHub.' - -inputs: - # name: - # description: 'Name of the job to be created. Note that the final name of the job will be the given name followed by the number of the build run `github.run_id`. Thhis value is provided as an output.' - # required: true - jobFile: - description: 'Path to the job file.' - required: true - # workspaceName: - # description: 'Name of the workspace to work against.' - # required: true - # resourceGroup: - # description: 'Name of the resource group where the workspace is placed.' - # required: true - # noWait: - # description: 'Indicates if the action should not wait for the job to finish.' - # required: false - # default: 'false' - -# outputs: -# jobName: -# description: Name of the job name created in the workspace. -# value: ${{ steps.jobRun.outputs.jobName }} - -runs: - using: "composite" - steps: - - name: Run AML Job - id: jobRun - shell: bash - run: | - run_id=$(az ml job create -f ${{ inputs.jobFile }} --query name -o tsv) - if [[ -z "$run_id" ]] - then - echo "Job creation failed" - exit 3 - fi - az ml job show -n $run_id --web - status=$(az ml job show -n $run_id --query status -o tsv) - if [[ -z "$status" ]] - then - echo "Status query failed" - exit 4 - fi - running=("Queued" "Starting" "Preparing" "Running" "Finalizing") - while [[ ${running[*]} =~ $status ]] - do - sleep 15 - status=$(az ml job show -n $run_id --query status -o tsv) - echo $status - done - if [[ "$status" = "Failed" ]] - then - echo "Training Job failed" - exit 3 - fi +name: Submitting job +description: 'Creates and submit a new job to Azure ML based on a job configuration. Jobs are named using the provided job name and a unique run id returned by GitHub.' + +inputs: + # name: + # description: 'Name of the job to be created. Note that the final name of the job will be the given name followed by the number of the build run `github.run_id`. Thhis value is provided as an output.' 
+ # required: true + jobFile: + description: 'Path to the job file.' + required: true + # workspaceName: + # description: 'Name of the workspace to work against.' + # required: true + # resourceGroup: + # description: 'Name of the resource group where the workspace is placed.' + # required: true + # noWait: + # description: 'Indicates if the action should not wait for the job to finish.' + # required: false + # default: 'false' + +# outputs: +# jobName: +# description: Name of the job name created in the workspace. +# value: ${{ steps.jobRun.outputs.jobName }} + +runs: + using: "composite" + steps: + - name: Run AML Job + id: jobRun + shell: bash + run: | + run_id=$(az ml job create -f ${{ inputs.jobFile }} --query name -o tsv) + run_id=$(echo "$run_id" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//') + if [[ -z "$run_id" ]] + then + echo "Job creation failed" + exit 3 + fi + az ml job show -n $run_id --web + status=$(az ml job show -n $run_id --query status -o tsv) + if [[ -z "$status" ]] + then + echo "Status query failed" + exit 4 + fi + running=("Queued" "Starting" "Preparing" "Running" "Finalizing") + while [[ ${running[*]} =~ $status ]] + do + sleep 15 + status=$(az ml job show -n $run_id --query status -o tsv) + echo $status + done + if [[ "$status" = "Failed" ]] + then + echo "Training Job failed" + exit 3 + fi diff --git a/.github/workflows/workshop_cd.yml b/.github/workflows/workshop_cd.yml index df3e7214..a972e3fb 100644 --- a/.github/workflows/workshop_cd.yml +++ b/.github/workflows/workshop_cd.yml @@ -1,64 +1,65 @@ -name: workshop-cd -on: - workflow_dispatch: - pull_request: - types: - - opened - branches: - - main - paths: - - src/workshop/core/** - - .github/workflows/workshop_cd.yml -jobs: - Workshop-Deployment: - runs-on: ubuntu-latest - steps: - - - name: Check out repository code - uses: actions/checkout@v2 - - - name: Setup python - uses: actions/setup-python@v2 - with: - python-version: '3.8' - - - name: Upgrade pip - run: | - python -m pip install --upgrade pip - python -m pip install --upgrade build - python -m pip install --upgrade twine - - - name: AZ Login - uses: azure/login@v1 - with: - creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} #setup replace AZURE_SERVICE_PRINCIPAL with the name of your Azure credentials secret in GitHub - - - name: Install az ml & and tools - run: | - az extension add -n ml -y --version 2.2.1 - sudo apt install jq - pip install yq - - - name: Run deployment - uses: ./.github/actions/aml-endpoint-deploy - with: - resourceGroup: azureml #setup replace azureml with the name of your resource group in Azure - workspaceName: ws01ent #setup replace ws01ent with the name of your workspace in Azure - endpointFile: src/workshop/core/scoring/endpoint.yml - deploymentFile: src/workshop/core/scoring/deployment.yml - modelVersion: latest - - - name: Test deployment - uses: ./.github/actions/aml-endpoint-test - with: - resourceGroup: azureml #setup replace azureml with the name of your resource group in Azure - workspaceName: ws01ent #setup replace ws01ent with the name of your workspace in Azure - endpointFile: src/workshop/core/scoring/endpoint.yml - requestFile: src/workshop/core/scoring/scoring_test_request.json - - - name: Swap deployment - uses: ./.github/actions/aml-endpoint-swap - with: - resourceGroup: azureml #setup replace azureml with the name of your resource group in Azure - workspaceName: ws01ent #setup replace ws01ent with the name of your workspace in Azure - endpointFile: src/workshop/core/scoring/endpoint.yml \ No newline at end of 
file +name: workshop-cd +on: + workflow_dispatch: + pull_request: + types: + - opened + branches: + - main + paths: + - src/workshop/core/** + - .github/workflows/workshop_cd.yml +jobs: + Workshop-Deployment: + #runs-on: [anildwa-wsl, linux, X64] + runs-on: ubuntu-latest + steps: + + - name: Check out repository code + uses: actions/checkout@v2 + + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build + python -m pip install --upgrade twine + + - name: AZ Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} #setup replace AZURE_SERVICE_PRINCIPAL with the name of your Azure credentials secret in GitHub + + - name: Install az ml & and tools + run: | + az extension add -n ml -y + sudo apt install jq + pip install yq + + - name: Run deployment + uses: ./.github/actions/aml-endpoint-deploy + with: + resourceGroup: aml-rg #setup replace azureml with the name of your resource group in Azure + workspaceName: anildwa-westus3 #setup replace ws01ent with the name of your workspace in Azure + endpointFile: src/workshop/core/scoring/endpoint.yml + deploymentFile: src/workshop/core/scoring/deployment.yml + modelVersion: latest + + - name: Test deployment + uses: ./.github/actions/aml-endpoint-test + with: + resourceGroup: aml-rg #setup replace azureml with the name of your resource group in Azure + workspaceName: anildwa-westus3 #setup replace ws01ent with the name of your workspace in Azure + endpointFile: src/workshop/core/scoring/endpoint.yml + requestFile: src/workshop/core/scoring/scoring_test_request.json + + - name: Swap deployment + uses: ./.github/actions/aml-endpoint-swap + with: + resourceGroup: aml-rg #setup replace azureml with the name of your resource group in Azure + workspaceName: anildwa-westus3 #setup replace ws01ent with the name of your workspace in Azure1 + endpointFile: src/workshop/core/scoring/endpoint.yml diff --git a/.github/workflows/workshop_ci.yml b/.github/workflows/workshop_ci.yml index 66a2799a..b590448b 100644 --- a/.github/workflows/workshop_ci.yml +++ b/.github/workflows/workshop_ci.yml @@ -1,47 +1,48 @@ -name: workshop-ci -on: - workflow_dispatch: - pull_request: - types: - - closed - branches: - - integration - paths: - - src/workshop/core/** - - .github/workflows/workshop_ci.yml -jobs: - Workshop-Train-Validation: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v3 - - name: Setup python - uses: actions/setup-python@v2 - with: - python-version: '3.8' # Version range or exact version of a Python version to use, using SemVer's version range syntax - - name: Upgrade pip - run: | - python -m pip install --upgrade pip - python -m pip install --upgrade build - python -m pip install --upgrade twine - - name: AZ Login - uses: azure/login@v1 - with: - creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} #setup: provide your Azure credentials name stored in github - - - name: Install az ml & set default values for AML - run: | #setup: provide group, workspace and location - az extension add -n ml -y --version 2.2.1 - az configure --defaults group=azureml workspace=ws01ent location=westus2 - - name: run training and model validation - run: | - az ml job create -s -f src/workshop/core/pipelines/training_pipeline.yml - - - name: Create Pull Request to Main - uses: thomaseizinger/create-pull-request@master - with: - GITHUB_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN_GITHUB 
}} #setup: provide your github secret name - head: ${{ github.ref }} - base: main - title: "An automatically created PR to main by successful CI" - +name: workshop-ci +on: + workflow_dispatch: + pull_request: + types: + - closed + branches: + - integration + paths: + - src/workshop/core/** + - .github/workflows/workshop_ci.yml +jobs: + Workshop-Train-Validation: + #runs-on: [anildwa-wsl, linux, X64] + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v3 + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: '3.8' # Version range or exact version of a Python version to use, using SemVer's version range syntax + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build + python -m pip install --upgrade twine + - name: AZ Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} #setup: provide your Azure credentials name stored in github + + - name: Install az ml & set default values for AML + run: | #setup: provide group, workspace and location + az extension add -n ml -y --version 2.2.1 + az configure --defaults group=aml-rg workspace=anildwa-westus3 location=westus3 + - name: run training and model validation + run: | + az ml job create -s -f src/workshop/core/pipelines/training_pipeline.yml + + - name: Create Pull Request to Main + uses: thomaseizinger/create-pull-request@master + with: + GITHUB_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN_GITHUB }} #setup: provide your github secret name + head: ${{ github.ref }} + base: main + title: "An automatically created PR to main by successful CI" + diff --git a/.github/workflows/workshop_unit_test.yml b/.github/workflows/workshop_unit_test.yml index fe6dcd59..72867867 100644 --- a/.github/workflows/workshop_unit_test.yml +++ b/.github/workflows/workshop_unit_test.yml @@ -1,39 +1,40 @@ -name: feature_engineering_unit_test -on: - workflow_dispatch: - push: - branches-ignore: - - main - - integration - paths: - - src/workshop/core/data_engineering/* - - .github/workflows/workshop_unit_test.yml - -jobs: - unit-test: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v3 - - name: Setup python - uses: actions/setup-python@v2 - with: - python-version: '3.8' # Version range or exact version of a Python version to use, using SemVer's version range syntax - - name: Upgrade pip - run: | - python -m pip install --upgrade pip - python -m pip install --upgrade build - python -m pip install --upgrade twine - - name: AZ Login - uses: azure/login@v1 - with: - creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} # SETUP: replace AZURE_SERVICE_PRINCIPAL with your own secret name - - name: Install AZ ML and tools - run: | # SETUP line 34 to point to your own AML workspace - az extension add -n ml -y --version 2.2.1 - az configure --defaults group=azureml workspace=ws01ent location=westus2 - - name: Run Feature Engineering - uses: ./.github/actions/aml-job-create - with: - jobFile: src/workshop/core/data_engineering/feature_engineering.yml - +name: feature_engineering_unit_test +on: + workflow_dispatch: + push: + branches-ignore: + - main + - integration + paths: + - src/workshop/core/feature_engineering/* + - .github/workflows/workshop_unit_test.yml + +jobs: + unit-test: + #runs-on: [anildwa-wsl, linux, X64] + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v3 + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: '3.8' 
# Version range or exact version of a Python version to use, using SemVer's version range syntax + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build + python -m pip install --upgrade twine + - name: AZ Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} # SETUP: replace AZURE_SERVICE_PRINCIPAL with your own secret name + - name: Install AZ ML and tools + run: | # SETUP line 34 to point to your own AML workspace + az extension add -n ml -y + az configure --defaults group=aml-rg workspace=anildwa-westus3 location=westus3 + - name: Run Feature Engineering + uses: ./.github/actions/aml-job-create + with: + jobFile: src/workshop/core/data_engineering/feature_engineering.yml + diff --git a/.gitignore b/.gitignore index 5ab4839a..1808c064 100644 --- a/.gitignore +++ b/.gitignore @@ -1,136 +1,136 @@ -# -src/workshop/data/*.parquet -src/workshop/data/*.joblib -*.amlignore -*.amltmp -*.ipynb_aml_checkpoints - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker +# +src/workshop/data/*.parquet +src/workshop/data/*.joblib +*.amlignore +*.amltmp +*.ipynb_aml_checkpoints + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker .pyre/ \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f9ba8cf6..c72a5749 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,9 +1,9 @@ -# Microsoft Open Source Code of Conduct - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). - -Resources: - -- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) -- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) -- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/LICENSE b/LICENSE index 9e841e7a..3d8b93bc 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ - MIT License - - Copyright (c) Microsoft Corporation. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/MLOps-ADO-ADB/.azure_pipelines/cd.yml b/MLOps-ADO-ADB/.azure_pipelines/cd.yml index 07b2b7ae..e3eb01b7 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/cd.yml +++ b/MLOps-ADO-ADB/.azure_pipelines/cd.yml @@ -1,144 +1,144 @@ -# Azure DevOps Pipeline to Run a Databricks Job -# This uses bash scripts to invoke the Databricks API and start a job. -# First we use the service principal's credentials to get a token from Entra -# Then we use that token to make an HTTP call to the Databricks API - -# This pipeline expects the following variables: -# - tenant_id: The ID of your Entra tenant (should be a guid) -# - sp_client_id: The service principal's client ID (should be a guid) -# - sp_credential: The service principal's credential (should be marked as a secret) -# - databricks_workspace_uri: The URI for the Databricks workspace (without the trailing slash) -# - ado_username: username for Azure DevOps with repo access to share with service principal -# - ado_username_pat: ADO personal_access_token for username - -trigger: - branches: - exclude: - - integration - include: - - main - paths: - include: - - src/workshop/notebooks/part_1_1_data_prep.ipynb - - src/workshop/notebooks/part_1_2_training.ipynb - - src/workshop/notebooks/part_1_3_evaluating.ipynb - - .azure_pipelines/cd.yml - -pool: - vmImage: ubuntu-latest - -variables: - - group: mlops-ado-adb-variables - - name: BRANCH_NAME - value: $[replace(variables['Build.SourceBranch'], 'refs/heads/', '')] - -steps: -- script: | - token=$(curl -s -X POST -H 'Content-Type: application/x-www-form-urlencoded' \ - https://login.microsoftonline.com/$(tenant_id)/oauth2/v2.0/token \ - -d 'client_id=$(sp_client_id)' \ - -d 'grant_type=client_credentials' \ - -d 'scope=2ff814a6-3304-4ab8-85cb-cd0e6f879c1d%2F.default' \ - -d 'client_secret='"$SP_CREDENTIAL"'' \ - | jq -r '.access_token') - - echo "##vso[task.setvariable variable=token;issecret=true]$token" - - displayName: 'Get Entra ID token' - env: - SP_CREDENTIAL: $(sp_credential) - -- script: | - result=$(curl -s -X GET \ - -H 'Authorization: Bearer '"$(token)"'' \ - $(databricks_workspace_uri)/api/2.0/git-credentials) - - for cred in $(echo 
"${result}" | jq -c '.credentials[] | {credential_id}'); do - echo "Deleting credentials" - echo $cred - cred_id=$(echo $cred | jq -r .credential_id) - del_result=$(curl -s -X DELETE \ - -H 'Authorization: Bearer '"$(token)"'' \ - $(databricks_workspace_uri)/api/2.0/git-credentials/${cred_id}) - done - - result=$(curl -s -X POST \ - -H 'Authorization: Bearer '"$(token)"'' \ - -H 'Content-Type: application/json' \ - -d '{ - "git_provider": "AzureDevOpsServices", - "personal_access_token": "$(ado_username_pat)", - "git_username": "$(ado_username)" - }' \ - $(databricks_workspace_uri)/api/2.0/git-credentials) - - echo $result - - displayName: 'Refresh Git Credentials' - -- script: | - cluster_def='{ - "spark_version": "13.2.x-cpu-ml-scala2.12", - "spark_conf": { - "spark.databricks.delta.preview.enabled": "true", - "spark.master": "local[*, 4]", - "spark.databricks.cluster.profile": "singleNode" - }, - "azure_attributes": { - "first_on_demand": 1, - "availability": "ON_DEMAND_AZURE", - "spot_bid_max_price": -1 - }, - "node_type_id": "Standard_D4a_v4", - "driver_node_type_id": "Standard_D4a_v4", - "custom_tags": { - "ResourceClass": "SingleNode" - }, - "spark_env_vars": { - "PYSPARK_PYTHON": "/databricks/python3/bin/python3" - }, - "enable_elastic_disk": true, - "data_security_mode": "LEGACY_SINGLE_USER_STANDARD", - "runtime_engine": "STANDARD", - "num_workers": 0 - }' - - result=$(curl -s -X POST \ - -H 'Authorization: Bearer '"$(token)"'' \ - -H 'Content-Type: application/json' \ - -d '{ - "run_name": "Model Eval on Prod Data Workflow - '"$(BRANCH_NAME)"'", - "tasks": [ - { - "task_key": "model_evaluation", - "notebook_task": { - "notebook_path": "src/workshop/notebooks/part_1_3_evaluating", - "source": "GIT", - "base_parameters": { - "run_name": "'"$(BRANCH_NAME)"'", - "devops_action": "Deployment" - } - }, - "new_cluster": '"$cluster_def"' - } - - ], - "git_source": { - "git_provider": "azureDevOpsServices", - "git_url": "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'", - "git_branch": "'"$(BRANCH_NAME)"'" - }, - "access_control_list": [ - { - "group_name": "users", - "permission_level": "CAN_VIEW" - } - ] - }' \ - $(databricks_workspace_uri)/api/2.1/jobs/runs/submit) - - echo Using Git URL: "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'" - - echo $result - +# Azure DevOps Pipeline to Run a Databricks Job +# This uses bash scripts to invoke the Databricks API and start a job. 
+# First we use the service principal's credentials to get a token from Entra +# Then we use that token to make an HTTP call to the Databricks API + +# This pipeline expects the following variables: +# - tenant_id: The ID of your Entra tenant (should be a guid) +# - sp_client_id: The service principal's client ID (should be a guid) +# - sp_credential: The service principal's credential (should be marked as a secret) +# - databricks_workspace_uri: The URI for the Databricks workspace (without the trailing slash) +# - ado_username: username for Azure DevOps with repo access to share with service principal +# - ado_username_pat: ADO personal_access_token for username + +trigger: + branches: + exclude: + - integration + include: + - main + paths: + include: + - src/workshop/notebooks/part_1_1_data_prep.ipynb + - src/workshop/notebooks/part_1_2_training.ipynb + - src/workshop/notebooks/part_1_3_evaluating.ipynb + - .azure_pipelines/cd.yml + +pool: + vmImage: ubuntu-latest + +variables: + - group: mlops-ado-adb-variables + - name: BRANCH_NAME + value: $[replace(variables['Build.SourceBranch'], 'refs/heads/', '')] + +steps: +- script: | + token=$(curl -s -X POST -H 'Content-Type: application/x-www-form-urlencoded' \ + https://login.microsoftonline.com/$(tenant_id)/oauth2/v2.0/token \ + -d 'client_id=$(sp_client_id)' \ + -d 'grant_type=client_credentials' \ + -d 'scope=2ff814a6-3304-4ab8-85cb-cd0e6f879c1d%2F.default' \ + -d 'client_secret='"$SP_CREDENTIAL"'' \ + | jq -r '.access_token') + + echo "##vso[task.setvariable variable=token;issecret=true]$token" + + displayName: 'Get Entra ID token' + env: + SP_CREDENTIAL: $(sp_credential) + +- script: | + result=$(curl -s -X GET \ + -H 'Authorization: Bearer '"$(token)"'' \ + $(databricks_workspace_uri)/api/2.0/git-credentials) + + for cred in $(echo "${result}" | jq -c '.credentials[] | {credential_id}'); do + echo "Deleting credentials" + echo $cred + cred_id=$(echo $cred | jq -r .credential_id) + del_result=$(curl -s -X DELETE \ + -H 'Authorization: Bearer '"$(token)"'' \ + $(databricks_workspace_uri)/api/2.0/git-credentials/${cred_id}) + done + + result=$(curl -s -X POST \ + -H 'Authorization: Bearer '"$(token)"'' \ + -H 'Content-Type: application/json' \ + -d '{ + "git_provider": "AzureDevOpsServices", + "personal_access_token": "$(ado_username_pat)", + "git_username": "$(ado_username)" + }' \ + $(databricks_workspace_uri)/api/2.0/git-credentials) + + echo $result + + displayName: 'Refresh Git Credentials' + +- script: | + cluster_def='{ + "spark_version": "13.2.x-cpu-ml-scala2.12", + "spark_conf": { + "spark.databricks.delta.preview.enabled": "true", + "spark.master": "local[*, 4]", + "spark.databricks.cluster.profile": "singleNode" + }, + "azure_attributes": { + "first_on_demand": 1, + "availability": "ON_DEMAND_AZURE", + "spot_bid_max_price": -1 + }, + "node_type_id": "Standard_D4a_v4", + "driver_node_type_id": "Standard_D4a_v4", + "custom_tags": { + "ResourceClass": "SingleNode" + }, + "spark_env_vars": { + "PYSPARK_PYTHON": "/databricks/python3/bin/python3" + }, + "enable_elastic_disk": true, + "data_security_mode": "LEGACY_SINGLE_USER_STANDARD", + "runtime_engine": "STANDARD", + "num_workers": 0 + }' + + result=$(curl -s -X POST \ + -H 'Authorization: Bearer '"$(token)"'' \ + -H 'Content-Type: application/json' \ + -d '{ + "run_name": "Model Eval on Prod Data Workflow - '"$(BRANCH_NAME)"'", + "tasks": [ + { + "task_key": "model_evaluation", + "notebook_task": { + "notebook_path": "src/workshop/notebooks/part_1_3_evaluating", + "source": 
"GIT", + "base_parameters": { + "run_name": "'"$(BRANCH_NAME)"'", + "devops_action": "Deployment" + } + }, + "new_cluster": '"$cluster_def"' + } + + ], + "git_source": { + "git_provider": "azureDevOpsServices", + "git_url": "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'", + "git_branch": "'"$(BRANCH_NAME)"'" + }, + "access_control_list": [ + { + "group_name": "users", + "permission_level": "CAN_VIEW" + } + ] + }' \ + $(databricks_workspace_uri)/api/2.1/jobs/runs/submit) + + echo Using Git URL: "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'" + + echo $result + displayName: 'Run Production Model Evaluation Databricks Workflow via API' \ No newline at end of file diff --git a/MLOps-ADO-ADB/.azure_pipelines/ci.yml b/MLOps-ADO-ADB/.azure_pipelines/ci.yml index 19489698..566aef6d 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/ci.yml +++ b/MLOps-ADO-ADB/.azure_pipelines/ci.yml @@ -1,168 +1,168 @@ -# Azure DevOps Pipeline to Run a Databricks Job -# This uses bash scripts to invoke the Databricks API and start a job. -# First we use the service principal's credentials to get a token from Entra -# Then we use that token to make an HTTP call to the Databricks API - -# This pipeline expects the following variables: -# - tenant_id: The ID of your Entra tenant (should be a guid) -# - sp_client_id: The service principal's client ID (should be a guid) -# - sp_credential: The service principal's credential (should be marked as a secret) -# - databricks_workspace_uri: The URI for the Databricks workspace (without the trailing slash) -# - ado_username: username for Azure DevOps with repo access to share with service principal -# - ado_username_pat: ADO personal_access_token for username - -trigger: - branches: - exclude: - - main - include: - - integration - paths: - include: - - src/workshop/notebooks/part_1_1_data_prep.ipynb - - src/workshop/notebooks/part_1_2_training.ipynb - - src/workshop/notebooks/part_1_3_evaluating.ipynb - - .azure_pipelines/ci.yml - -pool: - vmImage: ubuntu-latest - -variables: - - group: mlops-ado-adb-variables - - name: BRANCH_NAME - value: $[replace(variables['Build.SourceBranch'], 'refs/heads/', '')] - -steps: -- script: | - token=$(curl -s -X POST -H 'Content-Type: application/x-www-form-urlencoded' \ - https://login.microsoftonline.com/$(tenant_id)/oauth2/v2.0/token \ - -d 'client_id=$(sp_client_id)' \ - -d 'grant_type=client_credentials' \ - -d 'scope=2ff814a6-3304-4ab8-85cb-cd0e6f879c1d%2F.default' \ - -d 'client_secret='"$SP_CREDENTIAL"'' \ - | jq -r '.access_token') - - echo "##vso[task.setvariable variable=token;issecret=true]$token" - - displayName: 'Get Entra ID token' - env: - SP_CREDENTIAL: $(sp_credential) - -- script: | - result=$(curl -s -X GET \ - -H 'Authorization: Bearer '"$(token)"'' \ - $(databricks_workspace_uri)/api/2.0/git-credentials) - - for cred in $(echo "${result}" | jq -c '.credentials[] | {credential_id}'); do - echo "Deleting credentials" - echo $cred - cred_id=$(echo $cred | jq -r .credential_id) - del_result=$(curl -s -X DELETE \ - -H 'Authorization: Bearer '"$(token)"'' \ - $(databricks_workspace_uri)/api/2.0/git-credentials/${cred_id}) - done - - result=$(curl -s -X POST \ - -H 'Authorization: Bearer '"$(token)"'' \ - -H 'Content-Type: application/json' \ - -d '{ - "git_provider": "AzureDevOpsServices", - "personal_access_token": "$(ado_username_pat)", - "git_username": "$(ado_username)" - }' \ - $(databricks_workspace_uri)/api/2.0/git-credentials) - - echo $result - - 
displayName: 'Refresh Git Credentials' - -- script: | - cluster_def='{ - "spark_version": "13.2.x-cpu-ml-scala2.12", - "spark_conf": { - "spark.databricks.delta.preview.enabled": "true", - "spark.master": "local[*, 4]", - "spark.databricks.cluster.profile": "singleNode" - }, - "azure_attributes": { - "first_on_demand": 1, - "availability": "ON_DEMAND_AZURE", - "spot_bid_max_price": -1 - }, - "node_type_id": "Standard_D4a_v4", - "driver_node_type_id": "Standard_D4a_v4", - "custom_tags": { - "ResourceClass": "SingleNode" - }, - "spark_env_vars": { - "PYSPARK_PYTHON": "/databricks/python3/bin/python3" - }, - "enable_elastic_disk": true, - "data_security_mode": "LEGACY_SINGLE_USER_STANDARD", - "runtime_engine": "STANDARD", - "num_workers": 0 - }' - - result=$(curl -s -X POST \ - -H 'Authorization: Bearer '"$(token)"'' \ - -H 'Content-Type: application/json' \ - -d '{ - "run_name": "Model Training Workflow - '"$(BRANCH_NAME)"'", - "tasks": [ - { - "task_key": "data_prep", - "notebook_task": { - "notebook_path": "src/workshop/notebooks/part_1_1_data_prep", - "source": "GIT", - "base_parameters": { - "run_name": "'"$(BRANCH_NAME)"'" - } - }, - "new_cluster": '"$cluster_def"' - }, - { - "task_key": "model_training", - "notebook_task": { - "notebook_path": "src/workshop/notebooks/part_1_2_training", - "source": "GIT", - "base_parameters": { - "run_name": "'"$(BRANCH_NAME)"'" - } - }, - "depends_on": [ {"task_key": "data_prep"} ], - "new_cluster": '"$cluster_def"' - }, - { - "task_key": "model_evaluation", - "notebook_task": { - "notebook_path": "src/workshop/notebooks/part_1_3_evaluating", - "source": "GIT", - "base_parameters": { - "run_name": "'"$(BRANCH_NAME)"'", - "devops_action": "Integration" - } - }, - "depends_on": [ {"task_key": "model_training"} ], - "new_cluster": '"$cluster_def"' - } - - ], - "git_source": { - "git_provider": "azureDevOpsServices", - "git_url": "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'", - "git_branch": "'"$(BRANCH_NAME)"'" - }, - "access_control_list": [ - { - "group_name": "users", - "permission_level": "CAN_VIEW" - } - ] - }' \ - $(databricks_workspace_uri)/api/2.1/jobs/runs/submit) - - echo Using Git URL: "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'" - - echo $result - - displayName: 'Run Model Training Databricks Workflow via API' +# Azure DevOps Pipeline to Run a Databricks Job +# This uses bash scripts to invoke the Databricks API and start a job. 
+# First we use the service principal's credentials to get a token from Entra +# Then we use that token to make an HTTP call to the Databricks API + +# This pipeline expects the following variables: +# - tenant_id: The ID of your Entra tenant (should be a guid) +# - sp_client_id: The service principal's client ID (should be a guid) +# - sp_credential: The service principal's credential (should be marked as a secret) +# - databricks_workspace_uri: The URI for the Databricks workspace (without the trailing slash) +# - ado_username: username for Azure DevOps with repo access to share with service principal +# - ado_username_pat: ADO personal_access_token for username + +trigger: + branches: + exclude: + - main + include: + - integration + paths: + include: + - src/workshop/notebooks/part_1_1_data_prep.ipynb + - src/workshop/notebooks/part_1_2_training.ipynb + - src/workshop/notebooks/part_1_3_evaluating.ipynb + - .azure_pipelines/ci.yml + +pool: + vmImage: ubuntu-latest + +variables: + - group: mlops-ado-adb-variables + - name: BRANCH_NAME + value: $[replace(variables['Build.SourceBranch'], 'refs/heads/', '')] + +steps: +- script: | + token=$(curl -s -X POST -H 'Content-Type: application/x-www-form-urlencoded' \ + https://login.microsoftonline.com/$(tenant_id)/oauth2/v2.0/token \ + -d 'client_id=$(sp_client_id)' \ + -d 'grant_type=client_credentials' \ + -d 'scope=2ff814a6-3304-4ab8-85cb-cd0e6f879c1d%2F.default' \ + -d 'client_secret='"$SP_CREDENTIAL"'' \ + | jq -r '.access_token') + + echo "##vso[task.setvariable variable=token;issecret=true]$token" + + displayName: 'Get Entra ID token' + env: + SP_CREDENTIAL: $(sp_credential) + +- script: | + result=$(curl -s -X GET \ + -H 'Authorization: Bearer '"$(token)"'' \ + $(databricks_workspace_uri)/api/2.0/git-credentials) + + for cred in $(echo "${result}" | jq -c '.credentials[] | {credential_id}'); do + echo "Deleting credentials" + echo $cred + cred_id=$(echo $cred | jq -r .credential_id) + del_result=$(curl -s -X DELETE \ + -H 'Authorization: Bearer '"$(token)"'' \ + $(databricks_workspace_uri)/api/2.0/git-credentials/${cred_id}) + done + + result=$(curl -s -X POST \ + -H 'Authorization: Bearer '"$(token)"'' \ + -H 'Content-Type: application/json' \ + -d '{ + "git_provider": "AzureDevOpsServices", + "personal_access_token": "$(ado_username_pat)", + "git_username": "$(ado_username)" + }' \ + $(databricks_workspace_uri)/api/2.0/git-credentials) + + echo $result + + displayName: 'Refresh Git Credentials' + +- script: | + cluster_def='{ + "spark_version": "13.2.x-cpu-ml-scala2.12", + "spark_conf": { + "spark.databricks.delta.preview.enabled": "true", + "spark.master": "local[*, 4]", + "spark.databricks.cluster.profile": "singleNode" + }, + "azure_attributes": { + "first_on_demand": 1, + "availability": "ON_DEMAND_AZURE", + "spot_bid_max_price": -1 + }, + "node_type_id": "Standard_D4a_v4", + "driver_node_type_id": "Standard_D4a_v4", + "custom_tags": { + "ResourceClass": "SingleNode" + }, + "spark_env_vars": { + "PYSPARK_PYTHON": "/databricks/python3/bin/python3" + }, + "enable_elastic_disk": true, + "data_security_mode": "LEGACY_SINGLE_USER_STANDARD", + "runtime_engine": "STANDARD", + "num_workers": 0 + }' + + result=$(curl -s -X POST \ + -H 'Authorization: Bearer '"$(token)"'' \ + -H 'Content-Type: application/json' \ + -d '{ + "run_name": "Model Training Workflow - '"$(BRANCH_NAME)"'", + "tasks": [ + { + "task_key": "data_prep", + "notebook_task": { + "notebook_path": "src/workshop/notebooks/part_1_1_data_prep", + "source": "GIT", + 
"base_parameters": { + "run_name": "'"$(BRANCH_NAME)"'" + } + }, + "new_cluster": '"$cluster_def"' + }, + { + "task_key": "model_training", + "notebook_task": { + "notebook_path": "src/workshop/notebooks/part_1_2_training", + "source": "GIT", + "base_parameters": { + "run_name": "'"$(BRANCH_NAME)"'" + } + }, + "depends_on": [ {"task_key": "data_prep"} ], + "new_cluster": '"$cluster_def"' + }, + { + "task_key": "model_evaluation", + "notebook_task": { + "notebook_path": "src/workshop/notebooks/part_1_3_evaluating", + "source": "GIT", + "base_parameters": { + "run_name": "'"$(BRANCH_NAME)"'", + "devops_action": "Integration" + } + }, + "depends_on": [ {"task_key": "model_training"} ], + "new_cluster": '"$cluster_def"' + } + + ], + "git_source": { + "git_provider": "azureDevOpsServices", + "git_url": "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'", + "git_branch": "'"$(BRANCH_NAME)"'" + }, + "access_control_list": [ + { + "group_name": "users", + "permission_level": "CAN_VIEW" + } + ] + }' \ + $(databricks_workspace_uri)/api/2.1/jobs/runs/submit) + + echo Using Git URL: "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'" + + echo $result + + displayName: 'Run Model Training Databricks Workflow via API' diff --git a/MLOps-ADO-ADB/.azure_pipelines/move_model.yml b/MLOps-ADO-ADB/.azure_pipelines/move_model.yml index 3837fd59..8488415f 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/move_model.yml +++ b/MLOps-ADO-ADB/.azure_pipelines/move_model.yml @@ -1,34 +1,34 @@ -trigger: - branches: - include: - - dev - paths: - include: - - src/* - -variables: -- template: variables.yml - -pool: - vmImage: ubuntu-latest - -stages: -- stage: move_model - displayName: Model Model - jobs: - - job: move - steps: - - task: ShellScript@2 - displayName: 'Install Requirements' - inputs: - scriptPath: 'src/install_requirements.sh' - - template: templates/aml-model-register/step.yml - parameters: - azureServiceConnectionName: ${{ variables.azureServiceConnection }} - azureServiceConnectionNameTarget: ${{ variables.azureServiceConnectionProd }} - name: ${{ variables.name }} - workspaceName: ${{ variables.workspace }} - workspaceNameTarget: ${{ variables.workspaceProd }} - resourceGroup: ${{ variables.resourcegroup }} - resourceGroupTarget: ${{ variables.resourcegroupProd }} +trigger: + branches: + include: + - dev + paths: + include: + - src/* + +variables: +- template: variables.yml + +pool: + vmImage: ubuntu-latest + +stages: +- stage: move_model + displayName: Model Model + jobs: + - job: move + steps: + - task: ShellScript@2 + displayName: 'Install Requirements' + inputs: + scriptPath: 'src/install_requirements.sh' + - template: templates/aml-model-register/step.yml + parameters: + azureServiceConnectionName: ${{ variables.azureServiceConnection }} + azureServiceConnectionNameTarget: ${{ variables.azureServiceConnectionProd }} + name: ${{ variables.name }} + workspaceName: ${{ variables.workspace }} + workspaceNameTarget: ${{ variables.workspaceProd }} + resourceGroup: ${{ variables.resourcegroup }} + resourceGroupTarget: ${{ variables.resourcegroupProd }} modelPath: ${{ variables.modelPath }} \ No newline at end of file diff --git a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-batch-score-deploy/step.yml b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-batch-score-deploy/step.yml index 1ffc294c..65f3fbd4 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-batch-score-deploy/step.yml +++ 
b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-batch-score-deploy/step.yml @@ -1,133 +1,133 @@ -parameters: -- name: azureServiceConnectionName - type: string -- name: endpointFile - type: string -- name: deploymentFile - type: string -- name: modelVersion - type: string - default: -- name: workspaceName - type: string -- name: resourceGroup - type: string -- name: noWait - type: boolean - default: false -- name: args - type: string - default: -- name: secretsToKeyVault - type: boolean - default: false -- name: keyVaultName - type: string - default: - -steps: - - task: AzureCLI@2 - name: deployment - displayName: Deploying endpoint - inputs: - scriptType: bash - scriptLocation: inlineScript - azureSubscription: ${{ parameters.azureServiceConnectionName }} - inlineScript: | - set -e #Fail on errors - az configure --defaults workspace=${{ parameters.workspaceName }} group=${{ parameters.resourceGroup }} - ENDPOINT_FILES=$(find ${{ parameters.endpointFile }}) - for ENDPOINT_FILE in $ENDPOINT_FILES - do - ENDPOINT_FOLDER=$(dirname $ENDPOINT_FILE) - ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) - ENDPOINT_AUTH=$(yq -r ".auth_mode" $ENDPOINT_FILE) - # We are removing traffic key since this has the chicken and the egg issue. If you place .traffic you have - # to deploy the deployment first. But you can't deploy deployments without an endpoint. - echo "##[debug]Rewriting endpoint file without traffic" - yq -y "del(.traffic)" $ENDPOINT_FILE > $ENDPOINT_NAME.yml - echo "##[debug]Creating endpoint with name: $ENDPOINT_NAME" - if [[ $(az ml batch-endpoint show -n $ENDPOINT_NAME) ]]; then - echo "##[debug]Endpoint $ENDPOINT_NAME already exits. Creation skipped." - if [[ $(az ml batch-endpoint show -n $ENDPOINT_NAME | yq .auth_mode != "$ENDPOINT_AUTH") ]]; then - echo "##vso[task.logissue type=warning;sourcepath=$ENDPOINT_FILE;]Endpoint $ENDPOINT_NAME indicates a different authentication method that requires redeployment." 
- fi - else - az ml batch-endpoint create -f $ENDPOINT_NAME.yml - fi - # echo "##[debug]Retrieving URL and credentials" - # SCORING_URI=$(az ml batch-endpoint show -n $ENDPOINT_NAME | jq -r ".scoring_uri") - # SCORING_KEY=$(az ml batch-endpoint get-credentials -n $ENDPOINT_NAME -o tsv --query primaryKey) - - echo "##[debug]Looking for deployments in folder $ENDPOINT_FOLDER/${{ parameters.deploymentFile }}" - DEPLOYMENT_FILES=$(find $ENDPOINT_FOLDER/${{ parameters.deploymentFile }}) - - for DEPLOYMENT_FILE in $DEPLOYMENT_FILES - do - echo "##[debug]Working on deployment file $DEPLOYMENT_FILE" - DEPLOYMENT_NAME=$(yq -r ".name" $DEPLOYMENT_FILE) - DEPLOYMENT_MODEL=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f2) - DEPLOYMENT_MODEL_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) - # User can overwrite the version in the YAML - if [[ "${{ parameters.modelVersion }}" == "" ]]; then - TARGET_MODEL_VERSION=$DEPLOYMENT_MODEL_VERSION - else - echo "##[debug]Model being targeted is being overwriten with version ${{ parameters.modelVersion }}" - TARGET_MODEL_VERSION=${{ parameters.modelVersion }} - fi - - echo "##[debug]Working on deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" - if [[ "$TARGET_MODEL_VERSION" == "current" ]]; then - echo "##[debug]Identifying current version of the model at deployment $ENDPOINT_NAME/$DEPLOYMENT_NAME" - MODEL_CURRENT_URL=$(az ml batch-deployment show --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r ".model") - MODEL_CURRENT=$(basename $MODEL_CURRENT_URL) - echo "##[debug]Updating yaml files with current model version: $MODEL_CURRENT" - sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_CURRENT'/' $DEPLOYMENT_FILE - fi - if [[ "$TARGET_MODEL_VERSION" == "latest" ]]; then - echo "##[debug]Identifying latest version of the model $DEPLOYMENT_MODEL" - MODEL_LATEST=$(az ml model list --name $DEPLOYMENT_MODEL -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r '.[0].version') - - echo "##[debug]Updating yaml files with latest model version: $MODEL_LATEST" - sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_LATEST'/' $DEPLOYMENT_FILE - fi - if [[ "$TARGET_MODEL_VERSION" == *=* ]]; then - echo "##[debug]Identifying version of the model $DEPLOYMENT_MODEL with tags $TARGET_MODEL_VERSION" - TARGET_MODEL_TAG=$(echo $TARGET_MODEL_VERSION | cut -d= -f1) - TARGET_MODEL_TVALUE=$(echo $TARGET_MODEL_VERSION | cut -d= -f2) - MODEL_TAGGED=$(az ml model list -n $DEPLOYMENT_MODEL | jq -r --arg TARGET_MODEL_TAG $TARGET_MODEL_TAG --arg TARGET_MODEL_TVALUE $TARGET_MODEL_TVALUE '.[] | select(.tags[$TARGET_MODEL_TAG] == $TARGET_MODEL_TVALUE) | .version') - echo "##[debug]Updating yaml files with model version: $MODEL_TAGGED" - sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_TAGGED'/' $DEPLOYMENT_FILE - fi - - echo "##[debug]Creating deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" - if ${{ lower(parameters.noWait) }}; then - az ml batch-deployment create -f $DEPLOYMENT_FILE --only-show-errors --no-wait --set-default - else - az ml batch-deployment create -f $DEPLOYMENT_FILE --set-default - - - fi - - # echo "##[debug]Updating properties for deployment" - # BRANCH_REF=$(Build.SourceBranch) - # az ml batch-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git commit'=$(Build.SourceVersion) - # az ml batch-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git branch'=${BRANCH_REF#refs/*/} - # az ml 
batch-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git repository'=$(Build.Repository.Uri) - - echo "##[debug]Deployment completed" - done - - # if ${{ lower(parameters.secretsToKeyVault) }}; then - # echo "##[debug]Uploading secrets to key vault ${{ parameters.keyVaultName }}" - # az keyvault secret set --vault-name ${{ parameters.keyVaultName }} --name ${ENDPOINT_NAME//-/}ScoringUrl --value $SCORING_URI - # az keyvault secret set --vault-name ${{ parameters.keyVaultName }} --name ${ENDPOINT_NAME//-/}ScoringKey --value $SCORING_KEY - # fi - - echo "##[debug]Getting deployed version for model at file $DEPLOYMENT_FILE" - DEPLOYED_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) - echo "##vso[task.setvariable variable=deployedVersion;isOutput=true]$DEPLOYED_VERSION" - echo "##[debug]Deployed version is: $DEPLOYED_VERSION" - - echo "##[debug]Endpoint evaluation completed" +parameters: +- name: azureServiceConnectionName + type: string +- name: endpointFile + type: string +- name: deploymentFile + type: string +- name: modelVersion + type: string + default: +- name: workspaceName + type: string +- name: resourceGroup + type: string +- name: noWait + type: boolean + default: false +- name: args + type: string + default: +- name: secretsToKeyVault + type: boolean + default: false +- name: keyVaultName + type: string + default: + +steps: + - task: AzureCLI@2 + name: deployment + displayName: Deploying endpoint + inputs: + scriptType: bash + scriptLocation: inlineScript + azureSubscription: ${{ parameters.azureServiceConnectionName }} + inlineScript: | + set -e #Fail on errors + az configure --defaults workspace=${{ parameters.workspaceName }} group=${{ parameters.resourceGroup }} + ENDPOINT_FILES=$(find ${{ parameters.endpointFile }}) + for ENDPOINT_FILE in $ENDPOINT_FILES + do + ENDPOINT_FOLDER=$(dirname $ENDPOINT_FILE) + ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) + ENDPOINT_AUTH=$(yq -r ".auth_mode" $ENDPOINT_FILE) + # We are removing traffic key since this has the chicken and the egg issue. If you place .traffic you have + # to deploy the deployment first. But you can't deploy deployments without an endpoint. + echo "##[debug]Rewriting endpoint file without traffic" + yq -y "del(.traffic)" $ENDPOINT_FILE > $ENDPOINT_NAME.yml + echo "##[debug]Creating endpoint with name: $ENDPOINT_NAME" + if [[ $(az ml batch-endpoint show -n $ENDPOINT_NAME) ]]; then + echo "##[debug]Endpoint $ENDPOINT_NAME already exits. Creation skipped." + if [[ $(az ml batch-endpoint show -n $ENDPOINT_NAME | yq .auth_mode != "$ENDPOINT_AUTH") ]]; then + echo "##vso[task.logissue type=warning;sourcepath=$ENDPOINT_FILE;]Endpoint $ENDPOINT_NAME indicates a different authentication method that requires redeployment." 
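                # Note on the auth_mode check above: as written, the "!=" comparison is passed along inside the
                # command substitution handed to yq rather than being evaluated by the [[ ]] test, so the warning
                # is not gated on an actual string comparison. A minimal sketch of the intended check, assuming
                # the same python yq wrapper used elsewhere in this script:
                EXISTING_AUTH=$(az ml batch-endpoint show -n $ENDPOINT_NAME | yq -r ".auth_mode")
                if [[ "$EXISTING_AUTH" != "$ENDPOINT_AUTH" ]]; then
                  echo "##vso[task.logissue type=warning;sourcepath=$ENDPOINT_FILE;]Endpoint $ENDPOINT_NAME indicates a different authentication method that requires redeployment."
                fi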
+ fi + else + az ml batch-endpoint create -f $ENDPOINT_NAME.yml + fi + # echo "##[debug]Retrieving URL and credentials" + # SCORING_URI=$(az ml batch-endpoint show -n $ENDPOINT_NAME | jq -r ".scoring_uri") + # SCORING_KEY=$(az ml batch-endpoint get-credentials -n $ENDPOINT_NAME -o tsv --query primaryKey) + + echo "##[debug]Looking for deployments in folder $ENDPOINT_FOLDER/${{ parameters.deploymentFile }}" + DEPLOYMENT_FILES=$(find $ENDPOINT_FOLDER/${{ parameters.deploymentFile }}) + + for DEPLOYMENT_FILE in $DEPLOYMENT_FILES + do + echo "##[debug]Working on deployment file $DEPLOYMENT_FILE" + DEPLOYMENT_NAME=$(yq -r ".name" $DEPLOYMENT_FILE) + DEPLOYMENT_MODEL=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f2) + DEPLOYMENT_MODEL_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) + # User can overwrite the version in the YAML + if [[ "${{ parameters.modelVersion }}" == "" ]]; then + TARGET_MODEL_VERSION=$DEPLOYMENT_MODEL_VERSION + else + echo "##[debug]Model being targeted is being overwriten with version ${{ parameters.modelVersion }}" + TARGET_MODEL_VERSION=${{ parameters.modelVersion }} + fi + + echo "##[debug]Working on deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" + if [[ "$TARGET_MODEL_VERSION" == "current" ]]; then + echo "##[debug]Identifying current version of the model at deployment $ENDPOINT_NAME/$DEPLOYMENT_NAME" + MODEL_CURRENT_URL=$(az ml batch-deployment show --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r ".model") + MODEL_CURRENT=$(basename $MODEL_CURRENT_URL) + echo "##[debug]Updating yaml files with current model version: $MODEL_CURRENT" + sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_CURRENT'/' $DEPLOYMENT_FILE + fi + if [[ "$TARGET_MODEL_VERSION" == "latest" ]]; then + echo "##[debug]Identifying latest version of the model $DEPLOYMENT_MODEL" + MODEL_LATEST=$(az ml model list --name $DEPLOYMENT_MODEL -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r '.[0].version') + + echo "##[debug]Updating yaml files with latest model version: $MODEL_LATEST" + sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_LATEST'/' $DEPLOYMENT_FILE + fi + if [[ "$TARGET_MODEL_VERSION" == *=* ]]; then + echo "##[debug]Identifying version of the model $DEPLOYMENT_MODEL with tags $TARGET_MODEL_VERSION" + TARGET_MODEL_TAG=$(echo $TARGET_MODEL_VERSION | cut -d= -f1) + TARGET_MODEL_TVALUE=$(echo $TARGET_MODEL_VERSION | cut -d= -f2) + MODEL_TAGGED=$(az ml model list -n $DEPLOYMENT_MODEL | jq -r --arg TARGET_MODEL_TAG $TARGET_MODEL_TAG --arg TARGET_MODEL_TVALUE $TARGET_MODEL_TVALUE '.[] | select(.tags[$TARGET_MODEL_TAG] == $TARGET_MODEL_TVALUE) | .version') + echo "##[debug]Updating yaml files with model version: $MODEL_TAGGED" + sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_TAGGED'/' $DEPLOYMENT_FILE + fi + + echo "##[debug]Creating deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" + if ${{ lower(parameters.noWait) }}; then + az ml batch-deployment create -f $DEPLOYMENT_FILE --only-show-errors --no-wait --set-default + else + az ml batch-deployment create -f $DEPLOYMENT_FILE --set-default + + + fi + + # echo "##[debug]Updating properties for deployment" + # BRANCH_REF=$(Build.SourceBranch) + # az ml batch-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git commit'=$(Build.SourceVersion) + # az ml batch-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git branch'=${BRANCH_REF#refs/*/} + # az ml 
batch-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git repository'=$(Build.Repository.Uri) + + echo "##[debug]Deployment completed" + done + + # if ${{ lower(parameters.secretsToKeyVault) }}; then + # echo "##[debug]Uploading secrets to key vault ${{ parameters.keyVaultName }}" + # az keyvault secret set --vault-name ${{ parameters.keyVaultName }} --name ${ENDPOINT_NAME//-/}ScoringUrl --value $SCORING_URI + # az keyvault secret set --vault-name ${{ parameters.keyVaultName }} --name ${ENDPOINT_NAME//-/}ScoringKey --value $SCORING_KEY + # fi + + echo "##[debug]Getting deployed version for model at file $DEPLOYMENT_FILE" + DEPLOYED_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) + echo "##vso[task.setvariable variable=deployedVersion;isOutput=true]$DEPLOYED_VERSION" + echo "##[debug]Deployed version is: $DEPLOYED_VERSION" + + echo "##[debug]Endpoint evaluation completed" done \ No newline at end of file diff --git a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-endpoint-deploy/step.yml b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-endpoint-deploy/step.yml index 13027eb5..5b1a297d 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-endpoint-deploy/step.yml +++ b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-endpoint-deploy/step.yml @@ -1,174 +1,174 @@ -parameters: -- name: azureServiceConnectionName - type: string -- name: endpointFile - type: string -- name: deploymentFile - type: string -- name: modelVersion - type: string - default: -- name: workspaceName - type: string -- name: resourceGroup - type: string -- name: noWait - type: boolean - default: false -- name: args - type: string - default: -- name: secretsToKeyVault - type: boolean - default: false -- name: keyVaultName - type: string - default: - -steps: - - task: AzureCLI@2 - name: deployment - displayName: Deploying endpoint - inputs: - scriptType: bash - scriptLocation: inlineScript - azureSubscription: ${{ parameters.azureServiceConnectionName }} - inlineScript: | - set -e #Fail on errors - az configure --defaults workspace=${{ parameters.workspaceName }} group=${{ parameters.resourceGroup }} - ENDPOINT_FILES=$(find ${{ parameters.endpointFile }}) - for ENDPOINT_FILE in $ENDPOINT_FILES - do - ENDPOINT_FOLDER=$(dirname $ENDPOINT_FILE) - ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) - ENDPOINT_AUTH=$(yq -r ".auth_mode" $ENDPOINT_FILE) - # We are removing traffic key since this has the chicken and the egg issue. If you place .traffic you have - # to deploy the deployment first. But you can't deploy deployments without an endpoint. - echo "##[debug]Rewriting endpoint file without traffic" - yq -y "del(.traffic)" $ENDPOINT_FILE > $ENDPOINT_NAME.yml - echo "##[debug]Creating endpoint with name: $ENDPOINT_NAME" - if [[ $(az ml online-endpoint show -n $ENDPOINT_NAME) ]]; then - echo "##[debug]Endpoint $ENDPOINT_NAME already exits. Creation skipped." - if [[ $(az ml online-endpoint show -n $ENDPOINT_NAME | yq .auth_mode != "$ENDPOINT_AUTH") ]]; then - echo "##vso[task.logissue type=warning;sourcepath=$ENDPOINT_FILE;]Endpoint $ENDPOINT_NAME indicates a different authentication method that requires redeployment." 
- fi - else - az ml online-endpoint create -f $ENDPOINT_NAME.yml - fi - echo "##[debug]Retrieving URL and credentials" - SCORING_URI=$(az ml online-endpoint show -n $ENDPOINT_NAME | jq -r ".scoring_uri") - SCORING_KEY=$(az ml online-endpoint get-credentials -n $ENDPOINT_NAME -o tsv --query primaryKey) - - echo "##[debug]Looking for deployments in folder $ENDPOINT_FOLDER/${{ parameters.deploymentFile }}" - DEPLOYMENT_FILES=$(find $ENDPOINT_FOLDER/${{ parameters.deploymentFile }}) - - for DEPLOYMENT_FILE in $DEPLOYMENT_FILES - do - echo "##[debug]Working on deployment file $DEPLOYMENT_FILE" - DEPLOYMENT_NAME=$(yq -r ".name" $DEPLOYMENT_FILE) - DEPLOYMENT_MODEL=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f2) - DEPLOYMENT_MODEL_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) - # User can overwrite the version in the YAML - if [[ "${{ parameters.modelVersion }}" == "" ]]; then - TARGET_MODEL_VERSION=$DEPLOYMENT_MODEL_VERSION - else - echo "##[debug]Model being targeted is being overwriten with version ${{ parameters.modelVersion }}" - TARGET_MODEL_VERSION=${{ parameters.modelVersion }} - fi - - echo "##[debug]Working on deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" - if [[ "$TARGET_MODEL_VERSION" == "current" ]]; then - echo "##[debug]Identifying current version of the model at deployment $ENDPOINT_NAME/$DEPLOYMENT_NAME" - MODEL_CURRENT_URL=$(az ml online-deployment show --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r ".model") - MODEL_CURRENT=$(basename $MODEL_CURRENT_URL) - echo "##[debug]Updating yaml files with current model version: $MODEL_CURRENT" - sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_CURRENT'/' $DEPLOYMENT_FILE - fi - if [[ "$TARGET_MODEL_VERSION" == "latest" ]]; then - echo "##[debug]Identifying latest version of the model $DEPLOYMENT_MODEL" - MODEL_LATEST=$(az ml model list --name $DEPLOYMENT_MODEL -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r '.[0].version') - - echo "##[debug]Updating yaml files with latest model version: $MODEL_LATEST" - sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_LATEST'/' $DEPLOYMENT_FILE - fi - if [[ "$TARGET_MODEL_VERSION" == *=* ]]; then - echo "##[debug]Identifying version of the model $DEPLOYMENT_MODEL with tags $TARGET_MODEL_VERSION" - TARGET_MODEL_TAG=$(echo $TARGET_MODEL_VERSION | cut -d= -f1) - TARGET_MODEL_TVALUE=$(echo $TARGET_MODEL_VERSION | cut -d= -f2) - MODEL_TAGGED=$(az ml model list -n $DEPLOYMENT_MODEL | jq -r --arg TARGET_MODEL_TAG $TARGET_MODEL_TAG --arg TARGET_MODEL_TVALUE $TARGET_MODEL_TVALUE '.[] | select(.tags[$TARGET_MODEL_TAG] == $TARGET_MODEL_TVALUE) | .version') - echo "##[debug]Updating yaml files with model version: $MODEL_TAGGED" - sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_TAGGED'/' $DEPLOYMENT_FILE - fi - - echo "##[debug]Creating deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" - if ${{ lower(parameters.noWait) }}; then - az ml online-deployment create -f $DEPLOYMENT_FILE --only-show-errors --no-wait ${{ parameters.args }} - else - az ml online-deployment create -f $DEPLOYMENT_FILE --only-show-errors ${{ parameters.args }} - echo ##[debug]Configuring traffic for $ENDPOINT_NAME/$DEPLOYMENT_NAME" - TRAFFIC=$(yq --arg DEPLOYMENT_NAME $DEPLOYMENT_NAME '.traffic | select(has($DEPLOYMENT_NAME)) | .[]' $ENDPOINT_FILE) - if [[ -n $TRAFFIC ]]; then - echo "##[debug]Configuring traffic for $ENDPOINT_NAME/$DEPLOYMENT_NAME with value $TRAFFIC" - az ml online-endpoint update -n 
$ENDPOINT_NAME --traffic "$DEPLOYMENT_NAME=$TRAFFIC" - else - echo "##vso[task.logissue type=warning;sourcepath=$ENDPOINT_FILE;]Traffic is not indicated for $ENDPOINT_NAME/$DEPLOYMENT_NAME. You will have to configure traffic later." - fi - - echo "##[debug]Adquiring logs for deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" - mkdir -p logs - az ml online-deployment get-logs --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME >> logs/$ENDPOINT_NAME_$DEPLOYMENT_NAME.log - fi - - echo "##[debug]Updating properties for deployment" - BRANCH_REF=$(Build.SourceBranch) - az ml online-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git commit'=$(Build.SourceVersion) - az ml online-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git branch'=${BRANCH_REF#refs/*/} - az ml online-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git repository'=$(Build.Repository.Uri) - - echo "##[debug]Deployment completed" - done - - if ${{ lower(parameters.secretsToKeyVault) }}; then - echo "##[debug]Uploading secrets to key vault ${{ parameters.keyVaultName }}" - az keyvault secret set --vault-name ${{ parameters.keyVaultName }} --name ${ENDPOINT_NAME//-/}ScoringUrl --value $SCORING_URI - az keyvault secret set --vault-name ${{ parameters.keyVaultName }} --name ${ENDPOINT_NAME//-/}ScoringKey --value $SCORING_KEY - fi - - echo "##[debug]Getting deployed version for model at file $DEPLOYMENT_FILE" - DEPLOYED_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) - echo "##vso[task.setvariable variable=deployedVersion;isOutput=true]$DEPLOYED_VERSION" - echo "##[debug]Deployed version is: $DEPLOYED_VERSION" - - echo "##[debug]Endpoint evaluation completed" - done - - - task: AzureCLI@2 - name: collect_logs - displayName: Collecting deployment logs - inputs: - scriptType: bash - scriptLocation: inlineScript - azureSubscription: ${{ parameters.azureServiceConnectionName }} - inlineScript: | - echo "##[debug]Collecting deployment logs" - - ENDPOINT_FILES=$(find ${{ parameters.endpointFile }}) - for ENDPOINT_FILE in $ENDPOINT_FILES - do - ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) - - for DEPLOYMENT_FILE in $DEPLOYMENT_FILES - do - echo "##[debug]Working on deployment file $DEPLOYMENT_FILE" - DEPLOYMENT_NAME=$(yq -r ".name" $DEPLOYMENT_FILE) - - az ml online-deployment get-logs --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME >> ./logs/$ENDPOINT_NAME.log - done - done - - - task: PublishPipelineArtifact@1 - displayName: Uploading deployment logs - condition: and(succeeded(), eq('${{ parameters.noWait }}', 'false')) - inputs: - artifactName: deployment-logs +parameters: +- name: azureServiceConnectionName + type: string +- name: endpointFile + type: string +- name: deploymentFile + type: string +- name: modelVersion + type: string + default: +- name: workspaceName + type: string +- name: resourceGroup + type: string +- name: noWait + type: boolean + default: false +- name: args + type: string + default: +- name: secretsToKeyVault + type: boolean + default: false +- name: keyVaultName + type: string + default: + +steps: + - task: AzureCLI@2 + name: deployment + displayName: Deploying endpoint + inputs: + scriptType: bash + scriptLocation: inlineScript + azureSubscription: ${{ parameters.azureServiceConnectionName }} + inlineScript: | + set -e #Fail on errors + az configure --defaults workspace=${{ parameters.workspaceName }} group=${{ parameters.resourceGroup }} + ENDPOINT_FILES=$(find ${{ 
parameters.endpointFile }}) + for ENDPOINT_FILE in $ENDPOINT_FILES + do + ENDPOINT_FOLDER=$(dirname $ENDPOINT_FILE) + ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) + ENDPOINT_AUTH=$(yq -r ".auth_mode" $ENDPOINT_FILE) + # We are removing traffic key since this has the chicken and the egg issue. If you place .traffic you have + # to deploy the deployment first. But you can't deploy deployments without an endpoint. + echo "##[debug]Rewriting endpoint file without traffic" + yq -y "del(.traffic)" $ENDPOINT_FILE > $ENDPOINT_NAME.yml + echo "##[debug]Creating endpoint with name: $ENDPOINT_NAME" + if [[ $(az ml online-endpoint show -n $ENDPOINT_NAME) ]]; then + echo "##[debug]Endpoint $ENDPOINT_NAME already exits. Creation skipped." + if [[ $(az ml online-endpoint show -n $ENDPOINT_NAME | yq .auth_mode != "$ENDPOINT_AUTH") ]]; then + echo "##vso[task.logissue type=warning;sourcepath=$ENDPOINT_FILE;]Endpoint $ENDPOINT_NAME indicates a different authentication method that requires redeployment." + fi + else + az ml online-endpoint create -f $ENDPOINT_NAME.yml + fi + echo "##[debug]Retrieving URL and credentials" + SCORING_URI=$(az ml online-endpoint show -n $ENDPOINT_NAME | jq -r ".scoring_uri") + SCORING_KEY=$(az ml online-endpoint get-credentials -n $ENDPOINT_NAME -o tsv --query primaryKey) + + echo "##[debug]Looking for deployments in folder $ENDPOINT_FOLDER/${{ parameters.deploymentFile }}" + DEPLOYMENT_FILES=$(find $ENDPOINT_FOLDER/${{ parameters.deploymentFile }}) + + for DEPLOYMENT_FILE in $DEPLOYMENT_FILES + do + echo "##[debug]Working on deployment file $DEPLOYMENT_FILE" + DEPLOYMENT_NAME=$(yq -r ".name" $DEPLOYMENT_FILE) + DEPLOYMENT_MODEL=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f2) + DEPLOYMENT_MODEL_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) + # User can overwrite the version in the YAML + if [[ "${{ parameters.modelVersion }}" == "" ]]; then + TARGET_MODEL_VERSION=$DEPLOYMENT_MODEL_VERSION + else + echo "##[debug]Model being targeted is being overwriten with version ${{ parameters.modelVersion }}" + TARGET_MODEL_VERSION=${{ parameters.modelVersion }} + fi + + echo "##[debug]Working on deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" + if [[ "$TARGET_MODEL_VERSION" == "current" ]]; then + echo "##[debug]Identifying current version of the model at deployment $ENDPOINT_NAME/$DEPLOYMENT_NAME" + MODEL_CURRENT_URL=$(az ml online-deployment show --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r ".model") + MODEL_CURRENT=$(basename $MODEL_CURRENT_URL) + echo "##[debug]Updating yaml files with current model version: $MODEL_CURRENT" + sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_CURRENT'/' $DEPLOYMENT_FILE + fi + if [[ "$TARGET_MODEL_VERSION" == "latest" ]]; then + echo "##[debug]Identifying latest version of the model $DEPLOYMENT_MODEL" + MODEL_LATEST=$(az ml model list --name $DEPLOYMENT_MODEL -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r '.[0].version') + + echo "##[debug]Updating yaml files with latest model version: $MODEL_LATEST" + sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_LATEST'/' $DEPLOYMENT_FILE + fi + if [[ "$TARGET_MODEL_VERSION" == *=* ]]; then + echo "##[debug]Identifying version of the model $DEPLOYMENT_MODEL with tags $TARGET_MODEL_VERSION" + TARGET_MODEL_TAG=$(echo $TARGET_MODEL_VERSION | cut -d= -f1) + TARGET_MODEL_TVALUE=$(echo $TARGET_MODEL_VERSION | cut -d= -f2) + MODEL_TAGGED=$(az ml model list -n $DEPLOYMENT_MODEL | jq -r 
--arg TARGET_MODEL_TAG $TARGET_MODEL_TAG --arg TARGET_MODEL_TVALUE $TARGET_MODEL_TVALUE '.[] | select(.tags[$TARGET_MODEL_TAG] == $TARGET_MODEL_TVALUE) | .version') + echo "##[debug]Updating yaml files with model version: $MODEL_TAGGED" + sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$MODEL_TAGGED'/' $DEPLOYMENT_FILE + fi + + echo "##[debug]Creating deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" + if ${{ lower(parameters.noWait) }}; then + az ml online-deployment create -f $DEPLOYMENT_FILE --only-show-errors --no-wait ${{ parameters.args }} + else + az ml online-deployment create -f $DEPLOYMENT_FILE --only-show-errors ${{ parameters.args }} + echo ##[debug]Configuring traffic for $ENDPOINT_NAME/$DEPLOYMENT_NAME" + TRAFFIC=$(yq --arg DEPLOYMENT_NAME $DEPLOYMENT_NAME '.traffic | select(has($DEPLOYMENT_NAME)) | .[]' $ENDPOINT_FILE) + if [[ -n $TRAFFIC ]]; then + echo "##[debug]Configuring traffic for $ENDPOINT_NAME/$DEPLOYMENT_NAME with value $TRAFFIC" + az ml online-endpoint update -n $ENDPOINT_NAME --traffic "$DEPLOYMENT_NAME=$TRAFFIC" + else + echo "##vso[task.logissue type=warning;sourcepath=$ENDPOINT_FILE;]Traffic is not indicated for $ENDPOINT_NAME/$DEPLOYMENT_NAME. You will have to configure traffic later." + fi + + echo "##[debug]Adquiring logs for deployment with name: $ENDPOINT_NAME/$DEPLOYMENT_NAME" + mkdir -p logs + az ml online-deployment get-logs --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME >> logs/$ENDPOINT_NAME_$DEPLOYMENT_NAME.log + fi + + echo "##[debug]Updating properties for deployment" + BRANCH_REF=$(Build.SourceBranch) + az ml online-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git commit'=$(Build.SourceVersion) + az ml online-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git branch'=${BRANCH_REF#refs/*/} + az ml online-deployment update --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --set tags.'Git repository'=$(Build.Repository.Uri) + + echo "##[debug]Deployment completed" + done + + if ${{ lower(parameters.secretsToKeyVault) }}; then + echo "##[debug]Uploading secrets to key vault ${{ parameters.keyVaultName }}" + az keyvault secret set --vault-name ${{ parameters.keyVaultName }} --name ${ENDPOINT_NAME//-/}ScoringUrl --value $SCORING_URI + az keyvault secret set --vault-name ${{ parameters.keyVaultName }} --name ${ENDPOINT_NAME//-/}ScoringKey --value $SCORING_KEY + fi + + echo "##[debug]Getting deployed version for model at file $DEPLOYMENT_FILE" + DEPLOYED_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) + echo "##vso[task.setvariable variable=deployedVersion;isOutput=true]$DEPLOYED_VERSION" + echo "##[debug]Deployed version is: $DEPLOYED_VERSION" + + echo "##[debug]Endpoint evaluation completed" + done + + - task: AzureCLI@2 + name: collect_logs + displayName: Collecting deployment logs + inputs: + scriptType: bash + scriptLocation: inlineScript + azureSubscription: ${{ parameters.azureServiceConnectionName }} + inlineScript: | + echo "##[debug]Collecting deployment logs" + + ENDPOINT_FILES=$(find ${{ parameters.endpointFile }}) + for ENDPOINT_FILE in $ENDPOINT_FILES + do + ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) + + for DEPLOYMENT_FILE in $DEPLOYMENT_FILES + do + echo "##[debug]Working on deployment file $DEPLOYMENT_FILE" + DEPLOYMENT_NAME=$(yq -r ".name" $DEPLOYMENT_FILE) + + az ml online-deployment get-logs --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME >> ./logs/$ENDPOINT_NAME.log + done + done + + - task: PublishPipelineArtifact@1 + 
displayName: Uploading deployment logs + condition: and(succeeded(), eq('${{ parameters.noWait }}', 'false')) + inputs: + artifactName: deployment-logs targetPath: logs \ No newline at end of file diff --git a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-job-create/step.yml b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-job-create/step.yml index 2d4d6c17..da65c1a1 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-job-create/step.yml +++ b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-job-create/step.yml @@ -1,44 +1,44 @@ -parameters: -- name: azureServiceConnectionName - type: string -- name: name - type: string -- name: jobFile - type: string -- name: workspaceName - type: string -- name: resourceGroup - type: string -- name: noWait - type: boolean - default: false -- name: stepDisplayName - type: string - default: Submitting job - -steps: - - task: AzureCLI@2 - name: jobRun - displayName: ${{ parameters.stepDisplayName }} - inputs: - scriptType: bash - scriptLocation: inlineScript - azureSubscription: ${{ parameters.azureServiceConnectionName }} - inlineScript: | - JOB_NAME="${{ parameters.name }}-$(Build.BuildId)" - echo "##[debug]Creating job with name: $JOB_NAME" - echo "##vso[task.setvariable variable=jobName;isOutput=true]$JOB_NAME" - if ${{ lower(parameters.noWait) }}; then - az ml job create -n $JOB_NAME -f ${{ parameters.jobFile }} --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} - else - az ml job create -n $JOB_NAME -f ${{ parameters.jobFile }} --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} --stream >> job.log - fi - target: - settableVariables: - - jobName - - task: PublishPipelineArtifact@1 - displayName: Uploading job logs - condition: and(succeededOrFailed(), and(eq('${{ parameters.noWait }}', 'false'), ne(variables['jobRun.jobName'], ''))) - inputs: - artifactName: ${{ parameters.name }}-log +parameters: +- name: azureServiceConnectionName + type: string +- name: name + type: string +- name: jobFile + type: string +- name: workspaceName + type: string +- name: resourceGroup + type: string +- name: noWait + type: boolean + default: false +- name: stepDisplayName + type: string + default: Submitting job + +steps: + - task: AzureCLI@2 + name: jobRun + displayName: ${{ parameters.stepDisplayName }} + inputs: + scriptType: bash + scriptLocation: inlineScript + azureSubscription: ${{ parameters.azureServiceConnectionName }} + inlineScript: | + JOB_NAME="${{ parameters.name }}-$(Build.BuildId)" + echo "##[debug]Creating job with name: $JOB_NAME" + echo "##vso[task.setvariable variable=jobName;isOutput=true]$JOB_NAME" + if ${{ lower(parameters.noWait) }}; then + az ml job create -n $JOB_NAME -f ${{ parameters.jobFile }} --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} + else + az ml job create -n $JOB_NAME -f ${{ parameters.jobFile }} --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} --stream >> job.log + fi + target: + settableVariables: + - jobName + - task: PublishPipelineArtifact@1 + displayName: Uploading job logs + condition: and(succeededOrFailed(), and(eq('${{ parameters.noWait }}', 'false'), ne(variables['jobRun.jobName'], ''))) + inputs: + artifactName: ${{ parameters.name }}-log targetPath: job.log \ No newline at end of file diff --git a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-model-register/step.yml 
b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-model-register/step.yml index 75595b1c..633f4110 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-model-register/step.yml +++ b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-model-register/step.yml @@ -1,41 +1,41 @@ -parameters: -- name: azureServiceConnectionName - type: string -- name: name - type: string -- name: modelPath - type: string -- name: workspaceName - type: string -- name: resourceGroup - type: string -- name: azureServiceConnectionNameTarget - type: string -- name: workspaceNameTarget - type: string -- name: resourceGroupTarget - type: string -steps: - - task: AzureCLI@2 - displayName: Downloading model - inputs: - scriptType: bash - scriptLocation: inlineScript - azureSubscription: ${{ parameters.azureServiceConnectionName }} - workingDirectory: $(System.DefaultWorkingDirectory) - inlineScript: | - MODEL_LATEST=$(az ml model list --name ${{ parameters.name }} -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r '.[0].version') - az ml model download --name ${{ parameters.name }} --version $MODEL_LATEST --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} - - task: AzureCLI@2 - displayName: Registering model - inputs: - scriptType: bash - scriptLocation: inlineScript - azureSubscription: ${{ parameters.azureServiceConnectionNameTarget }} - workingDirectory: $(System.DefaultWorkingDirectory) - inlineScript: | - az ml model create --name "${{ parameters.name }}" --path "${{ parameters.name}}/${{ parameters.modelPath }}" --resource-group ${{ parameters.resourceGroupTarget }} --workspace-name ${{ parameters.workspaceNameTarget }} - - - +parameters: +- name: azureServiceConnectionName + type: string +- name: name + type: string +- name: modelPath + type: string +- name: workspaceName + type: string +- name: resourceGroup + type: string +- name: azureServiceConnectionNameTarget + type: string +- name: workspaceNameTarget + type: string +- name: resourceGroupTarget + type: string +steps: + - task: AzureCLI@2 + displayName: Downloading model + inputs: + scriptType: bash + scriptLocation: inlineScript + azureSubscription: ${{ parameters.azureServiceConnectionName }} + workingDirectory: $(System.DefaultWorkingDirectory) + inlineScript: | + MODEL_LATEST=$(az ml model list --name ${{ parameters.name }} -g ${{ parameters.resourceGroup }} -w ${{ parameters.workspaceName }} | jq -r '.[0].version') + az ml model download --name ${{ parameters.name }} --version $MODEL_LATEST --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} + - task: AzureCLI@2 + displayName: Registering model + inputs: + scriptType: bash + scriptLocation: inlineScript + azureSubscription: ${{ parameters.azureServiceConnectionNameTarget }} + workingDirectory: $(System.DefaultWorkingDirectory) + inlineScript: | + az ml model create --name "${{ parameters.name }}" --path "${{ parameters.name}}/${{ parameters.modelPath }}" --resource-group ${{ parameters.resourceGroupTarget }} --workspace-name ${{ parameters.workspaceNameTarget }} + + + \ No newline at end of file diff --git a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-model-register/step2.yml b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-model-register/step2.yml index 4c117f64..a41db5de 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/templates/aml-model-register/step2.yml +++ b/MLOps-ADO-ADB/.azure_pipelines/templates/aml-model-register/step2.yml @@ -1,54 +1,54 @@ -parameters: -- name: 
azureServiceConnectionName - type: string -- name: name - type: string -- name: description - type: string - default: -- name: fromJob - type: boolean - default: false -- name: fromAnotherWorkspace - type: boolean - default: false -- name: jobName - type: string - default: -- name: modelPath - type: string -- name: modelType - type: string - default: custom_model -- name: workspaceName - type: string - default: -- name: resourceGroup - type: string - default: -steps: - - task: AzureCLI@2 - displayName: Registering model - inputs: - scriptType: bash - scriptLocation: inlineScript - azureSubscription: ${{ parameters.azureServiceConnectionName }} - inlineScript: | - if ${{ lower(parameters.fromJob) }}; then - if ${{ lower(parameters.fromAnotherWorkspace) }}; then - echo "##[debug]Downloading assets from job ${{ parameters.jobName }}" - az ml job download --name ${{ parameters.jobName }} --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} - echo "##[debug]$(ls)" - MODEL_PATH="${{ parameters.jobName }}/${{ parameters.modelPath }}" - else - MODEL_PATH="azureml://jobs/${{ parameters.jobName }}/outputs/artifacts/${{ parameters.modelPath }}" - fi - else - if test -f "${{ parameters.jobName }}/${{ parameters.modelPath }}"; then - echo "##vso[task.logissue type=error]File ${{ parameters.modelPath }} not found." - exit 1 - fi - MODEL_PATH="${{ parameters.modelPath }}" - fi - echo "##[debug]Creating model from path $MODEL_PATH" - az ml model create --name "${{ parameters.name }}" --description "${{ parameters.description }}" --type "${{ parameters.modelType }}" --path $MODEL_PATH --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} +parameters: +- name: azureServiceConnectionName + type: string +- name: name + type: string +- name: description + type: string + default: +- name: fromJob + type: boolean + default: false +- name: fromAnotherWorkspace + type: boolean + default: false +- name: jobName + type: string + default: +- name: modelPath + type: string +- name: modelType + type: string + default: custom_model +- name: workspaceName + type: string + default: +- name: resourceGroup + type: string + default: +steps: + - task: AzureCLI@2 + displayName: Registering model + inputs: + scriptType: bash + scriptLocation: inlineScript + azureSubscription: ${{ parameters.azureServiceConnectionName }} + inlineScript: | + if ${{ lower(parameters.fromJob) }}; then + if ${{ lower(parameters.fromAnotherWorkspace) }}; then + echo "##[debug]Downloading assets from job ${{ parameters.jobName }}" + az ml job download --name ${{ parameters.jobName }} --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} + echo "##[debug]$(ls)" + MODEL_PATH="${{ parameters.jobName }}/${{ parameters.modelPath }}" + else + MODEL_PATH="azureml://jobs/${{ parameters.jobName }}/outputs/artifacts/${{ parameters.modelPath }}" + fi + else + if test -f "${{ parameters.jobName }}/${{ parameters.modelPath }}"; then + echo "##vso[task.logissue type=error]File ${{ parameters.modelPath }} not found." 
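          # Note on the guard above: "if test -f ..." raises the "not found" error when the file does exist,
          # which reads as the inverse of the message. A minimal sketch of the check the message implies,
          # using a hypothetical MODEL_FILE variable for the local path being validated:
          MODEL_FILE="${{ parameters.jobName }}/${{ parameters.modelPath }}"
          if ! test -f "$MODEL_FILE"; then
            echo "##vso[task.logissue type=error]File $MODEL_FILE not found."
            exit 1
          fi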
+ exit 1 + fi + MODEL_PATH="${{ parameters.modelPath }}" + fi + echo "##[debug]Creating model from path $MODEL_PATH" + az ml model create --name "${{ parameters.name }}" --description "${{ parameters.description }}" --type "${{ parameters.modelType }}" --path $MODEL_PATH --resource-group ${{ parameters.resourceGroup }} --workspace-name ${{ parameters.workspaceName }} diff --git a/MLOps-ADO-ADB/.azure_pipelines/variables.yml b/MLOps-ADO-ADB/.azure_pipelines/variables.yml index 9dd5e9ad..ecc41b4c 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/variables.yml +++ b/MLOps-ADO-ADB/.azure_pipelines/variables.yml @@ -1,10 +1,10 @@ -variables: - azureServiceConnection: "mlws-sp" - jobname: "auto-ml-train" - workspace: "mlws" - resourcegroup: "trial" - name: "regmodel" - azureServiceConnectionProd: "mcw-sp" - workspaceProd: "quick-start-ws" - resourcegroupProd: "MCW-MLOps" +variables: + azureServiceConnection: "mlws-sp" + jobname: "auto-ml-train" + workspace: "mlws" + resourcegroup: "trial" + name: "regmodel" + azureServiceConnectionProd: "mcw-sp" + workspaceProd: "quick-start-ws" + resourcegroupProd: "MCW-MLOps" modelPath: "model" \ No newline at end of file diff --git a/MLOps-ADO-ADB/.azure_pipelines/workshop_unit_test.yml b/MLOps-ADO-ADB/.azure_pipelines/workshop_unit_test.yml index c13a5e8a..147b3d51 100644 --- a/MLOps-ADO-ADB/.azure_pipelines/workshop_unit_test.yml +++ b/MLOps-ADO-ADB/.azure_pipelines/workshop_unit_test.yml @@ -1,146 +1,146 @@ -# Azure DevOps Pipeline to Run a Databricks Job -# This uses bash scripts to invoke the Databricks API and start a job. -# First we use the service principal's credentials to get a token from Entra -# Then we use that token to make an HTTP call to the Databricks API. - -# When we run the notebook, we want to pull the latest version of it from the AzDO repo. -# To do this, we would like to use the SP's credentials to pull the files from Git. -# AzDO now supports SP's connecting to repos... but Databricks does not yet support -# this in their git client. Therefore, we still have to use a PAT for a regular -# AzDO user. 
:-( - -# This pipeline expects the following variables: -# - tenant_id: The ID of your Entra tenant (should be a guid) -# - sp_client_id: The service principal's client ID (should be a guid) -# - sp_credential: The service principal's credential (should be marked as a secret) -# - databricks_workspace_uri: The URI for the Databricks workspace (without the trailing slash) -# - ado_username: username for Azure DevOps with repo access to share with service principal -# - ado_username_pat: ADO personal_access_token for username - -trigger: - branches: - exclude: - - main - - integration - paths: - include: - - src/workshop/notebooks/part_1_1_data_prep.ipynb - - src/workshop/notebooks/test_params.py - - .azure_pipelines/workshop_unit_test.yml - -pool: - vmImage: ubuntu-latest - -variables: - - group: mlops-ado-adb-variables - - name: BRANCH_NAME - value: $[replace(variables['Build.SourceBranch'], 'refs/heads/', '')] - -steps: -- script: | - token=$(curl -s -X POST -H 'Content-Type: application/x-www-form-urlencoded' \ - https://login.microsoftonline.com/$(tenant_id)/oauth2/v2.0/token \ - -d 'client_id=$(sp_client_id)' \ - -d 'grant_type=client_credentials' \ - -d 'scope=2ff814a6-3304-4ab8-85cb-cd0e6f879c1d%2F.default' \ - -d 'client_secret='"$SP_CREDENTIAL"'' \ - | jq -r '.access_token') - - echo "##vso[task.setvariable variable=token;issecret=true]$token" - - displayName: 'Get Entra ID token' - env: - SP_CREDENTIAL: $(sp_credential) - -- script: | - result=$(curl -s -X GET \ - -H 'Authorization: Bearer '"$(token)"'' \ - $(databricks_workspace_uri)/api/2.0/git-credentials) - - for cred in $(echo "${result}" | jq -c '.credentials[] | {credential_id}'); do - echo "Deleting credentials" - echo $cred - cred_id=$(echo $cred | jq -r .credential_id) - del_result=$(curl -s -X DELETE \ - -H 'Authorization: Bearer '"$(token)"'' \ - $(databricks_workspace_uri)/api/2.0/git-credentials/${cred_id}) - done - - result=$(curl -s -X POST \ - -H 'Authorization: Bearer '"$(token)"'' \ - -H 'Content-Type: application/json' \ - -d '{ - "git_provider": "AzureDevOpsServices", - "personal_access_token": "$(ado_username_pat)", - "git_username": "$(ado_username)" - }' \ - $(databricks_workspace_uri)/api/2.0/git-credentials) - - echo $result - - displayName: 'Refresh Git Credentials' - -- script: | - cluster_def='{ - "spark_version": "13.2.x-cpu-ml-scala2.12", - "spark_conf": { - "spark.databricks.delta.preview.enabled": "true", - "spark.master": "local[*, 4]", - "spark.databricks.cluster.profile": "singleNode" - }, - "azure_attributes": { - "first_on_demand": 1, - "availability": "ON_DEMAND_AZURE", - "spot_bid_max_price": -1 - }, - "node_type_id": "Standard_D4a_v4", - "driver_node_type_id": "Standard_D4a_v4", - "custom_tags": { - "ResourceClass": "SingleNode" - }, - "spark_env_vars": { - "PYSPARK_PYTHON": "/databricks/python3/bin/python3" - }, - "enable_elastic_disk": true, - "data_security_mode": "LEGACY_SINGLE_USER_STANDARD", - "runtime_engine": "STANDARD", - "num_workers": 0 - }' - - result=$(curl -s -X POST \ - -H 'Authorization: Bearer '"$(token)"'' \ - -H 'Content-Type: application/json' \ - -d '{ - "run_name": "Data Prep Unit Test Pipeline - '"$(BRANCH_NAME)"'", - "tasks": [ - { - "task_key": "data_prep", - "notebook_task": { - "notebook_path": "src/workshop/notebooks/part_1_1_data_prep", - "source": "GIT", - "base_parameters": { - "run_name": "'"$(BRANCH_NAME)"'" - } - }, - "new_cluster": '"$cluster_def"' - } - ], - "git_source": { - "git_provider": "azureDevOpsServices", - "git_url": 
"'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'", - "git_branch": "'"$(BRANCH_NAME)"'" - }, - "access_control_list": [ - { - "group_name": "users", - "permission_level": "CAN_VIEW" - } - ] - }' \ - $(databricks_workspace_uri)/api/2.1/jobs/runs/submit) - - echo Using Git URL: "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'" - - echo $result - - displayName: 'Run Databricks notebook via API' +# Azure DevOps Pipeline to Run a Databricks Job +# This uses bash scripts to invoke the Databricks API and start a job. +# First we use the service principal's credentials to get a token from Entra +# Then we use that token to make an HTTP call to the Databricks API. + +# When we run the notebook, we want to pull the latest version of it from the AzDO repo. +# To do this, we would like to use the SP's credentials to pull the files from Git. +# AzDO now supports SP's connecting to repos... but Databricks does not yet support +# this in their git client. Therefore, we still have to use a PAT for a regular +# AzDO user. :-( + +# This pipeline expects the following variables: +# - tenant_id: The ID of your Entra tenant (should be a guid) +# - sp_client_id: The service principal's client ID (should be a guid) +# - sp_credential: The service principal's credential (should be marked as a secret) +# - databricks_workspace_uri: The URI for the Databricks workspace (without the trailing slash) +# - ado_username: username for Azure DevOps with repo access to share with service principal +# - ado_username_pat: ADO personal_access_token for username + +trigger: + branches: + exclude: + - main + - integration + paths: + include: + - src/workshop/notebooks/part_1_1_data_prep.ipynb + - src/workshop/notebooks/test_params.py + - .azure_pipelines/workshop_unit_test.yml + +pool: + vmImage: ubuntu-latest + +variables: + - group: mlops-ado-adb-variables + - name: BRANCH_NAME + value: $[replace(variables['Build.SourceBranch'], 'refs/heads/', '')] + +steps: +- script: | + token=$(curl -s -X POST -H 'Content-Type: application/x-www-form-urlencoded' \ + https://login.microsoftonline.com/$(tenant_id)/oauth2/v2.0/token \ + -d 'client_id=$(sp_client_id)' \ + -d 'grant_type=client_credentials' \ + -d 'scope=2ff814a6-3304-4ab8-85cb-cd0e6f879c1d%2F.default' \ + -d 'client_secret='"$SP_CREDENTIAL"'' \ + | jq -r '.access_token') + + echo "##vso[task.setvariable variable=token;issecret=true]$token" + + displayName: 'Get Entra ID token' + env: + SP_CREDENTIAL: $(sp_credential) + +- script: | + result=$(curl -s -X GET \ + -H 'Authorization: Bearer '"$(token)"'' \ + $(databricks_workspace_uri)/api/2.0/git-credentials) + + for cred in $(echo "${result}" | jq -c '.credentials[] | {credential_id}'); do + echo "Deleting credentials" + echo $cred + cred_id=$(echo $cred | jq -r .credential_id) + del_result=$(curl -s -X DELETE \ + -H 'Authorization: Bearer '"$(token)"'' \ + $(databricks_workspace_uri)/api/2.0/git-credentials/${cred_id}) + done + + result=$(curl -s -X POST \ + -H 'Authorization: Bearer '"$(token)"'' \ + -H 'Content-Type: application/json' \ + -d '{ + "git_provider": "AzureDevOpsServices", + "personal_access_token": "$(ado_username_pat)", + "git_username": "$(ado_username)" + }' \ + $(databricks_workspace_uri)/api/2.0/git-credentials) + + echo $result + + displayName: 'Refresh Git Credentials' + +- script: | + cluster_def='{ + "spark_version": "13.2.x-cpu-ml-scala2.12", + "spark_conf": { + "spark.databricks.delta.preview.enabled": "true", + "spark.master": "local[*, 
4]", + "spark.databricks.cluster.profile": "singleNode" + }, + "azure_attributes": { + "first_on_demand": 1, + "availability": "ON_DEMAND_AZURE", + "spot_bid_max_price": -1 + }, + "node_type_id": "Standard_D4a_v4", + "driver_node_type_id": "Standard_D4a_v4", + "custom_tags": { + "ResourceClass": "SingleNode" + }, + "spark_env_vars": { + "PYSPARK_PYTHON": "/databricks/python3/bin/python3" + }, + "enable_elastic_disk": true, + "data_security_mode": "LEGACY_SINGLE_USER_STANDARD", + "runtime_engine": "STANDARD", + "num_workers": 0 + }' + + result=$(curl -s -X POST \ + -H 'Authorization: Bearer '"$(token)"'' \ + -H 'Content-Type: application/json' \ + -d '{ + "run_name": "Data Prep Unit Test Pipeline - '"$(BRANCH_NAME)"'", + "tasks": [ + { + "task_key": "data_prep", + "notebook_task": { + "notebook_path": "src/workshop/notebooks/part_1_1_data_prep", + "source": "GIT", + "base_parameters": { + "run_name": "'"$(BRANCH_NAME)"'" + } + }, + "new_cluster": '"$cluster_def"' + } + ], + "git_source": { + "git_provider": "azureDevOpsServices", + "git_url": "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'", + "git_branch": "'"$(BRANCH_NAME)"'" + }, + "access_control_list": [ + { + "group_name": "users", + "permission_level": "CAN_VIEW" + } + ] + }' \ + $(databricks_workspace_uri)/api/2.1/jobs/runs/submit) + + echo Using Git URL: "'"$(System.CollectionUri)$(System.TeamProject)"/_git/"$(Build.Repository.Name)"'" + + echo $result + + displayName: 'Run Databricks notebook via API' diff --git a/MLOps-ADO-ADB/.github/actions/aml-endpoint-deploy/action.yaml b/MLOps-ADO-ADB/.github/actions/aml-endpoint-deploy/action.yaml index f37c4b43..32f9ba29 100644 --- a/MLOps-ADO-ADB/.github/actions/aml-endpoint-deploy/action.yaml +++ b/MLOps-ADO-ADB/.github/actions/aml-endpoint-deploy/action.yaml @@ -1,126 +1,126 @@ -name: Deploy AzureML managed online endpoint -description: 'Deploys a model endpoint in Azure Machine Learning Services all along with all the deployments it contains. Logs are collected and uploaded.' - -inputs: - resourceGroup: - description: 'Name of the resource group where the workspace is placed.' - required: true - workspaceName: - description: 'Name of the workspace to work against.' - required: true - endpointFile: - description: 'Path to the endpoint YAML file.' - required: true - deploymentFile: - description: 'Path to the deployment YAML file for the given endpoint.' - required: true - modelVersion: - description: 'Model version you want to deploy. Supports either a specific version number, or "latest". If not specified, using the deployment file model version.' - required: false - default: '' - updateIfExists: - description: 'If endpoint exists, update it instead of creating a new one.' 
- required: false - default: 'false' -outputs: - deployedVersion: - description: 'Deployed version of the model' - value: ${{ steps.deployment.outputs.deployedVersion }} - -runs: - using: "composite" - steps: - - name: Deploy endpoint - id: deployment - shell: bash - run: | - set -e - az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} - - ENDPOINT_FILE=${{ inputs.endpointFile }} - DEPLOYMENT_FILE=${{ inputs.deploymentFile }} - - ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) - echo "Endpoint name: $ENDPOINT_NAME" - - # Removing traffic if present in endpoint config as we'll manage traffic setup as part of the safe rollout - echo "Rewriting endpoint file without traffic" - yq -y -i "del(.traffic)" $ENDPOINT_FILE - - # Create or update endpoint - { - echo "Creating endpoint with name: $ENDPOINT_NAME" && - az ml online-endpoint create -f $ENDPOINT_FILE - } || { - echo "Endpoint $ENDPOINT_NAME already exists" - if [ ${{ inputs.updateIfExists }} == 'true' ]; then - echo "Updating endpoint with name: $ENDPOINT_NAME" && - az ml online-endpoint update -f $ENDPOINT_FILE - else - echo "Skipping update of endpoint with name: $ENDPOINT_NAME" - fi - } - - # Identify which slot should be used to stage this deployment based on current traffic - echo "Reading endpoint traffic to identify target staging deployment slot" - az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > _endpoint_traffic.yml - echo "Endpoint traffic:" - cat _endpoint_traffic.yml - GREEN_TRAFFIC=$(yq .green _endpoint_traffic.yml) - BLUE_TRAFFIC=$(yq .blue _endpoint_traffic.yml) - if [[ $GREEN_TRAFFIC == null || $GREEN_TRAFFIC == 0 ]]; then - STAGING_DEPLOYMENT_NAME='green'; - else - if [[ $BLUE_TRAFFIC == null || $BLUE_TRAFFIC == 0 ]]; then - STAGING_DEPLOYMENT_NAME='blue'; - else - echo "::error::No staging slots available for endpoint $ENDPOINT_NAME. 
One of the green/blue slots needs to have 0% traffic."; - exit 1; - fi - fi - echo "Selected staging deployment name: $STAGING_DEPLOYMENT_NAME" - - # Updating deployment file to setup name of deployment based on staging name selected above - echo "Updating deployment name to $STAGING_DEPLOYMENT_NAME" - if [[ $STAGING_DEPLOYMENT_NAME == "blue" ]]; then - yq -y -i '.name= "blue"' $DEPLOYMENT_FILE; - else - yq -y -i '.name= "green"' $DEPLOYMENT_FILE; - fi - - # Overwrite the model version set in the deployment file with a specific version or 'latest' if specified in the workflow - DEPLOYMENT_MODEL=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f2) - DEPLOYMENT_MODEL_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3) - if [ -z "${{ inputs.modelVersion}}" ]; then - TARGET_MODEL_VERSION=$DEPLOYMENT_MODEL_VERSION - else - echo "Model being targeted is being overwriten with version ${{ inputs.modelVersion}}" - TARGET_MODEL_VERSION=${{ inputs.modelVersion}} - fi - if [[ "$TARGET_MODEL_VERSION" == "latest" ]]; then - echo "Identifying latest version of the model $DEPLOYMENT_MODEL" - TARGET_MODEL_VERSION=$(az ml model list --name $DEPLOYMENT_MODEL | jq -r '.[0].version') - echo "Latest version of model $DEPLOYMENT_MODEL is $TARGET_MODEL_VERSION" - fi - if [[ $TARGET_MODEL_VERSION != $DEPLOYMENT_MODEL_VERSION ]]; then - echo "Updating deployment file with model version: $TARGET_MODEL_VERSION" - sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$TARGET_MODEL_VERSION'/' $DEPLOYMENT_FILE - fi - echo "::set-output name=deployedVersion::$TARGET_MODEL_VERSION" - - # Create deployment - echo "Creating deployment with name: $ENDPOINT_NAME/$STAGING_DEPLOYMENT_NAME" - az ml online-deployment create -f $DEPLOYMENT_FILE --only-show-errors --set tags.git_commit=${GITHUB_SHA} - echo "Deployment completed" - - # Saving logs - echo "Acquiring logs for deployment with name: $ENDPOINT_NAME/$STAGING_DEPLOYMENT_NAME" - mkdir -p logs - az ml online-deployment get-logs --name $STAGING_DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME >> logs/$ENDPOINT_NAME_$STAGING_DEPLOYMENT_NAME.log - - - name: Upload deployment logs - uses: actions/upload-artifact@v2 - if: ${{ (failure() || success()) }} - with: - name: deployment-logs +name: Deploy AzureML managed online endpoint +description: 'Deploys a model endpoint in Azure Machine Learning Services all along with all the deployments it contains. Logs are collected and uploaded.' + +inputs: + resourceGroup: + description: 'Name of the resource group where the workspace is placed.' + required: true + workspaceName: + description: 'Name of the workspace to work against.' + required: true + endpointFile: + description: 'Path to the endpoint YAML file.' + required: true + deploymentFile: + description: 'Path to the deployment YAML file for the given endpoint.' + required: true + modelVersion: + description: 'Model version you want to deploy. Supports either a specific version number, or "latest". If not specified, using the deployment file model version.' + required: false + default: '' + updateIfExists: + description: 'If endpoint exists, update it instead of creating a new one.' 
+ required: false + default: 'false' +outputs: + deployedVersion: + description: 'Deployed version of the model' + value: ${{ steps.deployment.outputs.deployedVersion }} + +runs: + using: "composite" + steps: + - name: Deploy endpoint + id: deployment + shell: bash + run: | + set -e + az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} + + ENDPOINT_FILE=${{ inputs.endpointFile }} + DEPLOYMENT_FILE=${{ inputs.deploymentFile }} + + ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) + echo "Endpoint name: $ENDPOINT_NAME" + + # Removing traffic if present in endpoint config as we'll manage traffic setup as part of the safe rollout + echo "Rewriting endpoint file without traffic" + yq -y -i "del(.traffic)" $ENDPOINT_FILE + + # Create or update endpoint + { + echo "Creating endpoint with name: $ENDPOINT_NAME" && + az ml online-endpoint create -f $ENDPOINT_FILE + } || { + echo "Endpoint $ENDPOINT_NAME already exists" + if [ ${{ inputs.updateIfExists }} == 'true' ]; then + echo "Updating endpoint with name: $ENDPOINT_NAME" && + az ml online-endpoint update -f $ENDPOINT_FILE + else + echo "Skipping update of endpoint with name: $ENDPOINT_NAME" + fi + } + + # Identify which slot should be used to stage this deployment based on current traffic + echo "Reading endpoint traffic to identify target staging deployment slot" + az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > _endpoint_traffic.yml + echo "Endpoint traffic:" + cat _endpoint_traffic.yml + GREEN_TRAFFIC=$(yq .green _endpoint_traffic.yml) + BLUE_TRAFFIC=$(yq .blue _endpoint_traffic.yml) + if [[ $GREEN_TRAFFIC == null || $GREEN_TRAFFIC == 0 ]]; then + STAGING_DEPLOYMENT_NAME='green'; + else + if [[ $BLUE_TRAFFIC == null || $BLUE_TRAFFIC == 0 ]]; then + STAGING_DEPLOYMENT_NAME='blue'; + else + echo "::error::No staging slots available for endpoint $ENDPOINT_NAME. 
One of the green/blue slots needs to have 0% traffic.";
+            exit 1;
+          fi
+        fi
+        echo "Selected staging deployment name: $STAGING_DEPLOYMENT_NAME"
+
+        # Update the deployment file to set the deployment name based on the staging slot selected above
+        echo "Updating deployment name to $STAGING_DEPLOYMENT_NAME"
+        if [[ $STAGING_DEPLOYMENT_NAME == "blue" ]]; then
+          yq -y -i '.name= "blue"' $DEPLOYMENT_FILE;
+        else
+          yq -y -i '.name= "green"' $DEPLOYMENT_FILE;
+        fi
+
+        # Overwrite the model version set in the deployment file with a specific version or 'latest' if specified in the workflow
+        DEPLOYMENT_MODEL=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f2)
+        DEPLOYMENT_MODEL_VERSION=$(yq -r ".model" $DEPLOYMENT_FILE | cut -d: -f3)
+        if [ -z "${{ inputs.modelVersion}}" ]; then
+          TARGET_MODEL_VERSION=$DEPLOYMENT_MODEL_VERSION
+        else
+          echo "Model being targeted is being overwritten with version ${{ inputs.modelVersion}}"
+          TARGET_MODEL_VERSION=${{ inputs.modelVersion}}
+        fi
+        if [[ "$TARGET_MODEL_VERSION" == "latest" ]]; then
+          echo "Identifying latest version of the model $DEPLOYMENT_MODEL"
+          TARGET_MODEL_VERSION=$(az ml model list --name $DEPLOYMENT_MODEL | jq -r '.[0].version')
+          echo "Latest version of model $DEPLOYMENT_MODEL is $TARGET_MODEL_VERSION"
+        fi
+        if [[ $TARGET_MODEL_VERSION != $DEPLOYMENT_MODEL_VERSION ]]; then
+          echo "Updating deployment file with model version: $TARGET_MODEL_VERSION"
+          sed -i 's/:'$DEPLOYMENT_MODEL_VERSION'/:'$TARGET_MODEL_VERSION'/' $DEPLOYMENT_FILE
+        fi
+        echo "::set-output name=deployedVersion::$TARGET_MODEL_VERSION"
+
+        # Create deployment
+        echo "Creating deployment with name: $ENDPOINT_NAME/$STAGING_DEPLOYMENT_NAME"
+        az ml online-deployment create -f $DEPLOYMENT_FILE --only-show-errors --set tags.git_commit=${GITHUB_SHA}
+        echo "Deployment completed"
+
+        # Saving logs
+        echo "Acquiring logs for deployment with name: $ENDPOINT_NAME/$STAGING_DEPLOYMENT_NAME"
+        mkdir -p logs
+        az ml online-deployment get-logs --name $STAGING_DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME >> logs/${ENDPOINT_NAME}_${STAGING_DEPLOYMENT_NAME}.log
+
+  - name: Upload deployment logs
+    uses: actions/upload-artifact@v2
+    if: ${{ (failure() || success()) }}
+    with:
+      name: deployment-logs
+      path: logs/*
\ No newline at end of file
diff --git a/MLOps-ADO-ADB/.github/actions/aml-endpoint-swap/action.yaml b/MLOps-ADO-ADB/.github/actions/aml-endpoint-swap/action.yaml
index b3c948e2..0a9546c8 100644
--- a/MLOps-ADO-ADB/.github/actions/aml-endpoint-swap/action.yaml
+++ b/MLOps-ADO-ADB/.github/actions/aml-endpoint-swap/action.yaml
@@ -1,58 +1,58 @@
-name: Swap AzureML managed online endpoint deployments
-description: 'Swaps green/blue deployments of an Azure ML endpoint by switching traffic around between endpoint deployments.'
-
-inputs:
-  resourceGroup:
-    description: 'Name of the resource group where the workspace is placed.'
-    required: true
-  workspaceName:
-    description: 'Name of the workspace to work against.'
-    required: true
-  endpointFile:
-    description: 'Path to the endpoint YAML file. Wildcard paths are supported which means that all matched endpoints will be deployed.' 
- required: true - -runs: - using: "composite" - steps: - - name: Swap endpoint deployments - id: swap-deployments - shell: bash - run: | - set -e - az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} - - ENDPOINT_FILE=${{ inputs.endpointFile }} - ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) - echo "ENDPOINT_FILE: $ENDPOINT_FILE" - echo "ENDPOINT_NAME: $ENDPOINT_NAME" - - echo "Reading endpoint traffic to figure out which deployment is staging/production" - az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > endpoint_traffic.yml - echo "Endpoint traffic:" - cat endpoint_traffic.yml - GREEN_TRAFFIC=$(yq .green endpoint_traffic.yml) - BLUE_TRAFFIC=$(yq .blue endpoint_traffic.yml) - - if [ $GREEN_TRAFFIC == null ]; then - if [ $BLUE_TRAFFIC == null ]; then - echo "::error::No deployment slots available for endpoint $ENDPOINT_NAME. Nothing to swap."; - exit 1; - else - echo "Setting blue traffic to 100%" - az ml online-endpoint update -n $ENDPOINT_NAME --traffic "blue=100" - fi - else - if [ $BLUE_TRAFFIC == null ]; then - echo "Setting green traffic to 100%" - az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=100" - else - if [ $GREEN_TRAFFIC == 0 ]; then - echo "Setting traffic to: green=100 blue=0" - az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=100 blue=0" - else - echo "Setting traffic to: green=0 blue=100" - az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=0 blue=100" - fi - fi +name: Swap AzureML managed online endpoint deployments +description: 'Swaps green/blue deployments of an Azure ML endpoint by switching traffic around between endpoint deployments.' + +inputs: + resourceGroup: + description: 'Name of the resource group where the workspace is placed.' + required: true + workspaceName: + description: 'Name of the workspace to work against.' + required: true + endpointFile: + description: 'Path to the endpoint YAML file. Wildcard paths are supported which means that all matched endpoints will be deployed.' + required: true + +runs: + using: "composite" + steps: + - name: Swap endpoint deployments + id: swap-deployments + shell: bash + run: | + set -e + az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} + + ENDPOINT_FILE=${{ inputs.endpointFile }} + ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) + echo "ENDPOINT_FILE: $ENDPOINT_FILE" + echo "ENDPOINT_NAME: $ENDPOINT_NAME" + + echo "Reading endpoint traffic to figure out which deployment is staging/production" + az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > endpoint_traffic.yml + echo "Endpoint traffic:" + cat endpoint_traffic.yml + GREEN_TRAFFIC=$(yq .green endpoint_traffic.yml) + BLUE_TRAFFIC=$(yq .blue endpoint_traffic.yml) + + if [ $GREEN_TRAFFIC == null ]; then + if [ $BLUE_TRAFFIC == null ]; then + echo "::error::No deployment slots available for endpoint $ENDPOINT_NAME. 
Nothing to swap."; + exit 1; + else + echo "Setting blue traffic to 100%" + az ml online-endpoint update -n $ENDPOINT_NAME --traffic "blue=100" + fi + else + if [ $BLUE_TRAFFIC == null ]; then + echo "Setting green traffic to 100%" + az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=100" + else + if [ $GREEN_TRAFFIC == 0 ]; then + echo "Setting traffic to: green=100 blue=0" + az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=100 blue=0" + else + echo "Setting traffic to: green=0 blue=100" + az ml online-endpoint update -n $ENDPOINT_NAME --traffic "green=0 blue=100" + fi + fi fi \ No newline at end of file diff --git a/MLOps-ADO-ADB/.github/actions/aml-endpoint-test/action.yaml b/MLOps-ADO-ADB/.github/actions/aml-endpoint-test/action.yaml index 82bd0859..db52f5a0 100644 --- a/MLOps-ADO-ADB/.github/actions/aml-endpoint-test/action.yaml +++ b/MLOps-ADO-ADB/.github/actions/aml-endpoint-test/action.yaml @@ -1,47 +1,47 @@ -name: Test AzureML managed online endpoint deployment (0% traffic deployment) -description: 'Finds 0% traffic deployment of an Azure ML endpoint and tests it.' - -inputs: - resourceGroup: - description: 'Name of the resource group where the workspace is placed.' - required: true - workspaceName: - description: 'Name of the workspace to work against.' - required: true - endpointFile: - description: 'Path to the endpoint YAML file. Wildcard paths are supported which means that all matched endpoints will be deployed.' - required: true - requestFile: - description: 'Name of the json test request file.' - required: true - -runs: - using: "composite" - steps: - - name: Test endpoint deployments - id: test-deployment - shell: bash - run: | - set -e - az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} - - ENDPOINT_FILE=${{ inputs.endpointFile }} - ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) - echo "ENDPOINT_FILE: $ENDPOINT_FILE" - echo "ENDPOINT_NAME: $ENDPOINT_NAME" - - echo "Reading endpoint traffic to figure out which deployment is staging/production" - az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > endpoint_traffic.yml - echo "Endpoint traffic:" - cat endpoint_traffic.yml - GREEN_TRAFFIC=$(yq .green endpoint_traffic.yml) - BLUE_TRAFFIC=$(yq .blue endpoint_traffic.yml) - if [ $GREEN_TRAFFIC == 0 ]; then - TEST_DEPLOYMENT_NAME='green' - fi - if [ $BLUE_TRAFFIC == 0 ]; then - TEST_DEPLOYMENT_NAME='blue' - fi - - TEST_RESPONSE=$(az ml online-endpoint invoke -n $ENDPOINT_NAME --deployment $TEST_DEPLOYMENT_NAME --request-file ${{ inputs.requestFile }}) +name: Test AzureML managed online endpoint deployment (0% traffic deployment) +description: 'Finds 0% traffic deployment of an Azure ML endpoint and tests it.' + +inputs: + resourceGroup: + description: 'Name of the resource group where the workspace is placed.' + required: true + workspaceName: + description: 'Name of the workspace to work against.' + required: true + endpointFile: + description: 'Path to the endpoint YAML file. Wildcard paths are supported which means that all matched endpoints will be deployed.' + required: true + requestFile: + description: 'Name of the json test request file.' 
+ required: true + +runs: + using: "composite" + steps: + - name: Test endpoint deployments + id: test-deployment + shell: bash + run: | + set -e + az configure --defaults workspace=${{ inputs.workspaceName }} group=${{ inputs.resourceGroup }} + + ENDPOINT_FILE=${{ inputs.endpointFile }} + ENDPOINT_NAME=$(yq -r ".name" $ENDPOINT_FILE) + echo "ENDPOINT_FILE: $ENDPOINT_FILE" + echo "ENDPOINT_NAME: $ENDPOINT_NAME" + + echo "Reading endpoint traffic to figure out which deployment is staging/production" + az ml online-endpoint show -n $ENDPOINT_NAME --query "traffic" -o yaml > endpoint_traffic.yml + echo "Endpoint traffic:" + cat endpoint_traffic.yml + GREEN_TRAFFIC=$(yq .green endpoint_traffic.yml) + BLUE_TRAFFIC=$(yq .blue endpoint_traffic.yml) + if [ $GREEN_TRAFFIC == 0 ]; then + TEST_DEPLOYMENT_NAME='green' + fi + if [ $BLUE_TRAFFIC == 0 ]; then + TEST_DEPLOYMENT_NAME='blue' + fi + + TEST_RESPONSE=$(az ml online-endpoint invoke -n $ENDPOINT_NAME --deployment $TEST_DEPLOYMENT_NAME --request-file ${{ inputs.requestFile }}) # TODO: test that response is valid, fail with exit 1 if not \ No newline at end of file diff --git a/MLOps-ADO-ADB/.github/actions/aml-job-create/action.yaml b/MLOps-ADO-ADB/.github/actions/aml-job-create/action.yaml index e3e0c019..09eb9631 100644 --- a/MLOps-ADO-ADB/.github/actions/aml-job-create/action.yaml +++ b/MLOps-ADO-ADB/.github/actions/aml-job-create/action.yaml @@ -1,58 +1,58 @@ -name: Submitting job -description: 'Creates and submit a new job to Azure ML based on a job configuration. Jobs are named using the provided job name and a unique run id returned by GitHub.' - -inputs: - # name: - # description: 'Name of the job to be created. Note that the final name of the job will be the given name followed by the number of the build run `github.run_id`. Thhis value is provided as an output.' - # required: true - jobFile: - description: 'Path to the job file.' - required: true - # workspaceName: - # description: 'Name of the workspace to work against.' - # required: true - # resourceGroup: - # description: 'Name of the resource group where the workspace is placed.' - # required: true - # noWait: - # description: 'Indicates if the action should not wait for the job to finish.' - # required: false - # default: 'false' - -# outputs: -# jobName: -# description: Name of the job name created in the workspace. -# value: ${{ steps.jobRun.outputs.jobName }} - -runs: - using: "composite" - steps: - - name: Run AML Job - id: jobRun - shell: bash - run: | - run_id=$(az ml job create -f ${{ inputs.jobFile }} --query name -o tsv) - if [[ -z "$run_id" ]] - then - echo "Job creation failed" - exit 3 - fi - az ml job show -n $run_id --web - status=$(az ml job show -n $run_id --query status -o tsv) - if [[ -z "$status" ]] - then - echo "Status query failed" - exit 4 - fi - running=("Queued" "Starting" "Preparing" "Running" "Finalizing") - while [[ ${running[*]} =~ $status ]] - do - sleep 15 - status=$(az ml job show -n $run_id --query status -o tsv) - echo $status - done - if [[ "$status" = "Failed" ]] - then - echo "Training Job failed" - exit 3 - fi +name: Submitting job +description: 'Creates and submit a new job to Azure ML based on a job configuration. Jobs are named using the provided job name and a unique run id returned by GitHub.' + +inputs: + # name: + # description: 'Name of the job to be created. Note that the final name of the job will be the given name followed by the number of the build run `github.run_id`. Thhis value is provided as an output.' 
+ # required: true + jobFile: + description: 'Path to the job file.' + required: true + # workspaceName: + # description: 'Name of the workspace to work against.' + # required: true + # resourceGroup: + # description: 'Name of the resource group where the workspace is placed.' + # required: true + # noWait: + # description: 'Indicates if the action should not wait for the job to finish.' + # required: false + # default: 'false' + +# outputs: +# jobName: +# description: Name of the job name created in the workspace. +# value: ${{ steps.jobRun.outputs.jobName }} + +runs: + using: "composite" + steps: + - name: Run AML Job + id: jobRun + shell: bash + run: | + run_id=$(az ml job create -f ${{ inputs.jobFile }} --query name -o tsv) + if [[ -z "$run_id" ]] + then + echo "Job creation failed" + exit 3 + fi + az ml job show -n $run_id --web + status=$(az ml job show -n $run_id --query status -o tsv) + if [[ -z "$status" ]] + then + echo "Status query failed" + exit 4 + fi + running=("Queued" "Starting" "Preparing" "Running" "Finalizing") + while [[ ${running[*]} =~ $status ]] + do + sleep 15 + status=$(az ml job show -n $run_id --query status -o tsv) + echo $status + done + if [[ "$status" = "Failed" ]] + then + echo "Training Job failed" + exit 3 + fi diff --git a/MLOps-ADO-ADB/.github/workflows/workshop_cd.yml b/MLOps-ADO-ADB/.github/workflows/workshop_cd.yml index 4b8d1b48..03fec7d1 100644 --- a/MLOps-ADO-ADB/.github/workflows/workshop_cd.yml +++ b/MLOps-ADO-ADB/.github/workflows/workshop_cd.yml @@ -1,64 +1,64 @@ -name: workshop-cd -on: - workflow_dispatch: - pull_request: - types: - - opened - branches: - - main - paths: - - src/workshop/core/** - - .github/workflows/workshop_cd.yml -jobs: - Workshop-Deployment: - runs-on: ubuntu-latest - steps: - - - name: Check out repository code - uses: actions/checkout@v2 - - - name: Setup python - uses: actions/setup-python@v2 - with: - python-version: '3.8' - - - name: Upgrade pip - run: | - python -m pip install --upgrade pip - python -m pip install --upgrade build - python -m pip install --upgrade twine - - - name: AZ Login - uses: azure/login@v1 - with: - creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} #setup replace AZURE_SERVICE_PRINCIPAL with the name of your Azure credentials secret in GitHub - - - name: Install az ml & and tools - run: | - az extension add -n ml -y --version 2.2.1 - sudo apt install jq - pip install yq - - - name: Run deployment - uses: ./.github/actions/aml-endpoint-deploy - with: - resourceGroup: njs-aia-rg #setup replace azureml with the name of your resource group in Azure - workspaceName: njs-ws #setup replace ws01ent with the name of your workspace in Azure - endpointFile: src/workshop/core/scoring/endpoint.yml - deploymentFile: src/workshop/core/scoring/deployment.yml - modelVersion: latest - - - name: Test deployment - uses: ./.github/actions/aml-endpoint-test - with: - resourceGroup: njs-aia-rg #setup replace azureml with the name of your resource group in Azure - workspaceName: njs-ws #setup replace ws01ent with the name of your workspace in Azure - endpointFile: src/workshop/core/scoring/endpoint.yml - requestFile: src/workshop/core/scoring/scoring_test_request.json - - - name: Swap deployment - uses: ./.github/actions/aml-endpoint-swap - with: - resourceGroup: njs-aia-rg #setup replace azureml with the name of your resource group in Azure - workspaceName: njs-ws #setup replace ws01ent with the name of your workspace in Azure +name: workshop-cd +on: + workflow_dispatch: + pull_request: + types: + - opened + branches: + 
- main + paths: + - src/workshop/core/** + - .github/workflows/workshop_cd.yml +jobs: + Workshop-Deployment: + runs-on: ubuntu-latest + steps: + + - name: Check out repository code + uses: actions/checkout@v2 + + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build + python -m pip install --upgrade twine + + - name: AZ Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} #setup replace AZURE_SERVICE_PRINCIPAL with the name of your Azure credentials secret in GitHub + + - name: Install az ml & and tools + run: | + az extension add -n ml -y --version 2.2.1 + sudo apt install jq + pip install yq + + - name: Run deployment + uses: ./.github/actions/aml-endpoint-deploy + with: + resourceGroup: njs-aia-rg #setup replace azureml with the name of your resource group in Azure + workspaceName: njs-ws #setup replace ws01ent with the name of your workspace in Azure + endpointFile: src/workshop/core/scoring/endpoint.yml + deploymentFile: src/workshop/core/scoring/deployment.yml + modelVersion: latest + + - name: Test deployment + uses: ./.github/actions/aml-endpoint-test + with: + resourceGroup: njs-aia-rg #setup replace azureml with the name of your resource group in Azure + workspaceName: njs-ws #setup replace ws01ent with the name of your workspace in Azure + endpointFile: src/workshop/core/scoring/endpoint.yml + requestFile: src/workshop/core/scoring/scoring_test_request.json + + - name: Swap deployment + uses: ./.github/actions/aml-endpoint-swap + with: + resourceGroup: njs-aia-rg #setup replace azureml with the name of your resource group in Azure + workspaceName: njs-ws #setup replace ws01ent with the name of your workspace in Azure endpointFile: src/workshop/core/scoring/endpoint.yml \ No newline at end of file diff --git a/MLOps-ADO-ADB/.github/workflows/workshop_ci.yml b/MLOps-ADO-ADB/.github/workflows/workshop_ci.yml index fd00c820..05ab49ed 100644 --- a/MLOps-ADO-ADB/.github/workflows/workshop_ci.yml +++ b/MLOps-ADO-ADB/.github/workflows/workshop_ci.yml @@ -1,47 +1,47 @@ -name: workshop-ci -on: - workflow_dispatch: - pull_request: - types: - - closed - branches: - - integration - paths: - - src/workshop/core/** - - .github/workflows/workshop_ci.yml -jobs: - Workshop-Train-Validation: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v3 - - name: Setup python - uses: actions/setup-python@v2 - with: - python-version: '3.8' # Version range or exact version of a Python version to use, using SemVer's version range syntax - - name: Upgrade pip - run: | - python -m pip install --upgrade pip - python -m pip install --upgrade build - python -m pip install --upgrade twine - - name: AZ Login - uses: azure/login@v1 - with: - creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} #setup: provide your Azure credentials name stored in github - - - name: Install az ml & set default values for AML - run: | #setup: provide group, workspace and location - az extension add -n ml -y --version 2.2.1 - az configure --defaults group=njs-aia-rg workspace=njs-ws location=eastus - - name: run training and model validation - run: | - az ml job create -s -f src/workshop/core/pipelines/training_pipeline.yml - - - name: Create Pull Request to Main - uses: thomaseizinger/create-pull-request@master - with: - GITHUB_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN_GITHUB }} #setup: provide your github secret name - head: ${{ 
github.ref }} - base: main - title: "An automatically created PR to main by successful CI" - +name: workshop-ci +on: + workflow_dispatch: + pull_request: + types: + - closed + branches: + - integration + paths: + - src/workshop/core/** + - .github/workflows/workshop_ci.yml +jobs: + Workshop-Train-Validation: + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v3 + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: '3.8' # Version range or exact version of a Python version to use, using SemVer's version range syntax + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build + python -m pip install --upgrade twine + - name: AZ Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} #setup: provide your Azure credentials name stored in github + + - name: Install az ml & set default values for AML + run: | #setup: provide group, workspace and location + az extension add -n ml -y --version 2.2.1 + az configure --defaults group=njs-aia-rg workspace=njs-ws location=eastus + - name: run training and model validation + run: | + az ml job create -s -f src/workshop/core/pipelines/training_pipeline.yml + + - name: Create Pull Request to Main + uses: thomaseizinger/create-pull-request@master + with: + GITHUB_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN_GITHUB }} #setup: provide your github secret name + head: ${{ github.ref }} + base: main + title: "An automatically created PR to main by successful CI" + diff --git a/MLOps-ADO-ADB/.github/workflows/workshop_unit_test.yml b/MLOps-ADO-ADB/.github/workflows/workshop_unit_test.yml index a99a1e6a..034da9a4 100644 --- a/MLOps-ADO-ADB/.github/workflows/workshop_unit_test.yml +++ b/MLOps-ADO-ADB/.github/workflows/workshop_unit_test.yml @@ -1,39 +1,39 @@ -name: feature_engineering_unit_test -on: - workflow_dispatch: - push: - branches-ignore: - - main - - integration - paths: - - src/workshop/core/data_engineering/* - - .github/workflows/workshop_unit_test.yml - -jobs: - unit-test: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v3 - - name: Setup python - uses: actions/setup-python@v2 - with: - python-version: '3.8' # Version range or exact version of a Python version to use, using SemVer's version range syntax - - name: Upgrade pip - run: | - python -m pip install --upgrade pip - python -m pip install --upgrade build - python -m pip install --upgrade twine - - name: AZ Login - uses: azure/login@v1 - with: - creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} # SETUP: replace AZURE_SERVICE_PRINCIPAL with your own secret name - - name: Install AZ ML and tools - run: | # SETUP line 34 to point to your own AML workspace - az extension add -n ml -y --version 2.2.1 - az configure --defaults group=njs-aia-rg workspace=njs-ws location=eastus - - name: Run Feature Engineering - uses: ./.github/actions/aml-job-create - with: - jobFile: src/workshop/core/data_engineering/feature_engineering.yml - +name: feature_engineering_unit_test +on: + workflow_dispatch: + push: + branches-ignore: + - main + - integration + paths: + - src/workshop/core/data_engineering/* + - .github/workflows/workshop_unit_test.yml + +jobs: + unit-test: + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v3 + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: '3.8' # Version range or exact version of a Python version to use, using SemVer's 
version range syntax + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build + python -m pip install --upgrade twine + - name: AZ Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_SERVICE_PRINCIPAL }} # SETUP: replace AZURE_SERVICE_PRINCIPAL with your own secret name + - name: Install AZ ML and tools + run: | # SETUP line 34 to point to your own AML workspace + az extension add -n ml -y --version 2.2.1 + az configure --defaults group=njs-aia-rg workspace=njs-ws location=eastus + - name: Run Feature Engineering + uses: ./.github/actions/aml-job-create + with: + jobFile: src/workshop/core/data_engineering/feature_engineering.yml + diff --git a/MLOps-ADO-ADB/.gitignore b/MLOps-ADO-ADB/.gitignore index 5ab4839a..1808c064 100644 --- a/MLOps-ADO-ADB/.gitignore +++ b/MLOps-ADO-ADB/.gitignore @@ -1,136 +1,136 @@ -# -src/workshop/data/*.parquet -src/workshop/data/*.joblib -*.amlignore -*.amltmp -*.ipynb_aml_checkpoints - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker +# +src/workshop/data/*.parquet +src/workshop/data/*.joblib +*.amlignore +*.amltmp +*.ipynb_aml_checkpoints + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker .pyre/ \ No newline at end of file diff --git a/MLOps-ADO-ADB/CODE_OF_CONDUCT.md b/MLOps-ADO-ADB/CODE_OF_CONDUCT.md index f9ba8cf6..c72a5749 100644 --- a/MLOps-ADO-ADB/CODE_OF_CONDUCT.md +++ b/MLOps-ADO-ADB/CODE_OF_CONDUCT.md @@ -1,9 +1,9 @@ -# Microsoft Open Source Code of Conduct - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). - -Resources: - -- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) -- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) -- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/MLOps-ADO-ADB/LICENSE.txt b/MLOps-ADO-ADB/LICENSE.txt index f48a22b0..42097ef5 100644 --- a/MLOps-ADO-ADB/LICENSE.txt +++ b/MLOps-ADO-ADB/LICENSE.txt @@ -1,21 +1,21 @@ -MIT License - -Copyright (c) 2023 Nick Switanek - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +MIT License + +Copyright (c) 2023 Nick Switanek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MLOps-ADO-ADB/README.md b/MLOps-ADO-ADB/README.md index 3bc3c0ef..3d8eb8c9 100644 --- a/MLOps-ADO-ADB/README.md +++ b/MLOps-ADO-ADB/README.md @@ -1,29 +1,29 @@ -[![Board Status](https://dev.azure.com/mlops-field/c4a73005-3da3-411a-806b-e3fc770a2d0f/b4d02123-12a2-46bc-b717-3862eac0b33f/_apis/work/boardbadge/c154e447-6da6-4e54-9da2-880c03bd8e89)](https://dev.azure.com/mlops-field/c4a73005-3da3-411a-806b-e3fc770a2d0f/_boards/board/t/b4d02123-12a2-46bc-b717-3862eac0b33f/Microsoft.RequirementCategory) -# MLOps with Azure DevOps and Azure Databricks -MLOps-ado-adb is a repo created by Microsoft field personnel (GBB, CSA, MTC) that provides a template to facilitate an introductory workshop on modern MLOps practices, using Azure DevOps for CI/CD pipelines and Azure Databricks for ML asset development and compute. This repo is modeled after the [work](https://github.com/microsoft/MLOpsTemplate/) of Microsoft's West Region CSU, which instead uses GitHub Actions and Azure Machine Learning. - -Here is the link to the workshop materials: -- [MLOps workshop materials](/src/workshop/) - - -## Contributing - -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. - -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
-For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or -contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. - -## Trademarks - -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow -[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). -Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. -Any use of third-party trademarks or logos are subject to those third-party's policies. +[![Board Status](https://dev.azure.com/mlops-field/c4a73005-3da3-411a-806b-e3fc770a2d0f/b4d02123-12a2-46bc-b717-3862eac0b33f/_apis/work/boardbadge/c154e447-6da6-4e54-9da2-880c03bd8e89)](https://dev.azure.com/mlops-field/c4a73005-3da3-411a-806b-e3fc770a2d0f/_boards/board/t/b4d02123-12a2-46bc-b717-3862eac0b33f/Microsoft.RequirementCategory) +# MLOps with Azure DevOps and Azure Databricks +MLOps-ado-adb is a repo created by Microsoft field personnel (GBB, CSA, MTC) that provides a template to facilitate an introductory workshop on modern MLOps practices, using Azure DevOps for CI/CD pipelines and Azure Databricks for ML asset development and compute. This repo is modeled after the [work](https://github.com/microsoft/MLOpsTemplate/) of Microsoft's West Region CSU, which instead uses GitHub Actions and Azure Machine Learning. + +Here is the link to the workshop materials: +- [MLOps workshop materials](/src/workshop/) + + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. 
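Editor's note: the composite actions earlier in this change drive the blue/green rollout entirely through the Azure ML CLI (`az ml online-endpoint` / `az ml online-deployment`) plus `yq` for reading the traffic split. As a hedged, illustrative sketch only (not part of this change), the same inspect-then-promote step could be exercised by hand. It assumes the `ml` CLI extension and `yq` are installed, that defaults were set with `az configure --defaults group=<rg> workspace=<ws>`, and that an endpoint named `my-endpoint` (a hypothetical name) already has a 0% slot; the real action additionally handles the case where a slot is absent (`null`).

```bash
#!/usr/bin/env bash
# Illustrative sketch: inspect blue/green traffic on a managed online endpoint
# and promote whichever slot currently has 0% traffic, mirroring the simplified
# logic of the aml-endpoint-swap action above.
set -e

ENDPOINT_NAME="my-endpoint"   # hypothetical endpoint name

# Dump the current traffic split to a small YAML file, as the actions do
az ml online-endpoint show -n "$ENDPOINT_NAME" --query "traffic" -o yaml > endpoint_traffic.yml
GREEN_TRAFFIC=$(yq .green endpoint_traffic.yml)
BLUE_TRAFFIC=$(yq .blue endpoint_traffic.yml)
echo "green=$GREEN_TRAFFIC blue=$BLUE_TRAFFIC"

# Route 100% of traffic to whichever slot currently receives 0%
if [[ "$GREEN_TRAFFIC" == "0" ]]; then
  az ml online-endpoint update -n "$ENDPOINT_NAME" --traffic "green=100 blue=0"
elif [[ "$BLUE_TRAFFIC" == "0" ]]; then
  az ml online-endpoint update -n "$ENDPOINT_NAME" --traffic "green=0 blue=100"
else
  echo "No 0% slot found; nothing to swap"
fi
```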
diff --git a/MLOps-ADO-ADB/SECURITY.md b/MLOps-ADO-ADB/SECURITY.md index f7b89984..12fbd833 100644 --- a/MLOps-ADO-ADB/SECURITY.md +++ b/MLOps-ADO-ADB/SECURITY.md @@ -1,41 +1,41 @@ - - -## Security - -Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). - -If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. - -## Reporting Security Issues - -**Please do not report security vulnerabilities through public GitHub issues.** - -Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). - -If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). - -You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). - -Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: - - * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) - * Full paths of source file(s) related to the manifestation of the issue - * The location of the affected source code (tag/branch/commit or direct URL) - * Any special configuration required to reproduce the issue - * Step-by-step instructions to reproduce the issue - * Proof-of-concept or exploit code (if possible) - * Impact of the issue, including how an attacker might exploit the issue - -This information will help us triage your report more quickly. - -If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. - -## Preferred Languages - -We prefer all communications to be in English. - -## Policy - -Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). - + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 
+ +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). + \ No newline at end of file diff --git a/MLOps-ADO-ADB/SUPPORT.md b/MLOps-ADO-ADB/SUPPORT.md index 8b05616f..dc72f0e5 100644 --- a/MLOps-ADO-ADB/SUPPORT.md +++ b/MLOps-ADO-ADB/SUPPORT.md @@ -1,25 +1,25 @@ -# TODO: The maintainer of this repo has not yet edited this file - -**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? - -- **No CSS support:** Fill out this template with information about how to file issues and get help. -- **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). -- **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. - -*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* - -# Support - -## How to file issues and get help - -This project uses GitHub Issues to track bugs and feature requests. Please search the existing -issues before filing new issues to avoid duplicates. For new issues, file your bug or -feature request as a new Issue. 
- -For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE -FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER -CHANNEL. WHERE WILL YOU HELP PEOPLE?**. - -## Microsoft Support Policy - -Support for this **PROJECT or PRODUCT** is limited to the resources listed above. +# TODO: The maintainer of this repo has not yet edited this file + +**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? + +- **No CSS support:** Fill out this template with information about how to file issues and get help. +- **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). +- **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. + +*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* + +# Support + +## How to file issues and get help + +This project uses GitHub Issues to track bugs and feature requests. Please search the existing +issues before filing new issues to avoid duplicates. For new issues, file your bug or +feature request as a new Issue. + +For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE +FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER +CHANNEL. WHERE WILL YOU HELP PEOPLE?**. + +## Microsoft Support Policy + +Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/MLOps-ADO-ADB/src/workshop/README.md b/MLOps-ADO-ADB/src/workshop/README.md index 1cc3f26b..664691f9 100644 --- a/MLOps-ADO-ADB/src/workshop/README.md +++ b/MLOps-ADO-ADB/src/workshop/README.md @@ -1,48 +1,48 @@ -# MLOps Workshop - -## Introduction -The MLOps workshop is an instructor-led workshop that provides guidance on an MLOps -implementation in Azure. MLOps is a pattern of practices rather than a technology, and there are various ways of implementing MLOps on Azure. This workshop leverages [Azure Databricks](https://learn.microsoft.com/en-us/azure/databricks/introduction/) -and [Azure DevOps](https://learn.microsoft.com/en-us/azure/devops/user-guide/what-is-azure-devops?view=azure-devops) -to implement a robust set of workflows to support machine learning models in production. For a workshop using Azure Machine Learning and GitHub Actions, see a similar set of materials [here](https://github.com/microsoft/MLOpsTemplate/). - -The core capability deployed in this scenario is a prediction of wine quality using a set of empirical measures. This is based on a [UCI Dataset](https://archive.ics.uci.edu/dataset/186/wine+quality). This is treated as a classification scenario, which occurs frequently for many enterprises. For the purpose of this workshop, the key stages of exploring the data, -engineering predictive features (data engineering) and model building (training, hyperparameter tuning, -algorithm selection, etc.) will be assumed to be done and already codified in this [Databricks -notebook](https://learn.microsoft.com/en-us/azure/databricks/mlflow/end-to-end-example). 
-The core focus of the workshop will then be how to refactor this notebook for easier maintenance and iterative development, lay the DevOps foundations for the ML lifecycle, for continuous delivery of the best predictive capabilities in production even as data science team members experiment with new techniques to improve model performance. - -## Audience -- Data scientists -- ML engineers -- ML platform architects and managers -- ... and any other roles that require hands-on experience to support ML models in Azure - -## Goals -- Understand key elements of modern MLOps and how it helps improve and accelerate ML practices. -- Design experiments and MLOps pipelines in Azure Databricks. -- Get hands-on experience in building continuous integration and continuous deployment pipelines with Azure DevOps. - - -Now, head to [Workshop Environment Setup: Part 0](documents/part_0.md) - - -## Contributing -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. - -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). -For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or -contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. - -## Trademarks -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow -[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). -Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. -Any use of third-party trademarks or logos are subject to those third-party's policies. +# MLOps Workshop + +## Introduction +The MLOps workshop is an instructor-led workshop that provides guidance on an MLOps +implementation in Azure. MLOps is a pattern of practices rather than a technology, and there are various ways of implementing MLOps on Azure. This workshop leverages [Azure Databricks](https://learn.microsoft.com/en-us/azure/databricks/introduction/) +and [Azure DevOps](https://learn.microsoft.com/en-us/azure/devops/user-guide/what-is-azure-devops?view=azure-devops) +to implement a robust set of workflows to support machine learning models in production. For a workshop using Azure Machine Learning and GitHub Actions, see a similar set of materials [here](https://github.com/microsoft/MLOpsTemplate/). + +The core capability deployed in this scenario is a prediction of wine quality using a set of empirical measures. This is based on a [UCI Dataset](https://archive.ics.uci.edu/dataset/186/wine+quality). This is treated as a classification scenario, which occurs frequently for many enterprises. 
For the purpose of this workshop, the key stages of exploring the data, +engineering predictive features (data engineering) and model building (training, hyperparameter tuning, +algorithm selection, etc.) will be assumed to be done and already codified in this [Databricks +notebook](https://learn.microsoft.com/en-us/azure/databricks/mlflow/end-to-end-example). +The core focus of the workshop will then be how to refactor this notebook for easier maintenance and iterative development, lay the DevOps foundations for the ML lifecycle, for continuous delivery of the best predictive capabilities in production even as data science team members experiment with new techniques to improve model performance. + +## Audience +- Data scientists +- ML engineers +- ML platform architects and managers +- ... and any other roles that require hands-on experience to support ML models in Azure + +## Goals +- Understand key elements of modern MLOps and how it helps improve and accelerate ML practices. +- Design experiments and MLOps pipelines in Azure Databricks. +- Get hands-on experience in building continuous integration and continuous deployment pipelines with Azure DevOps. + + +Now, head to [Workshop Environment Setup: Part 0](documents/part_0.md) + + +## Contributing +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. 
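Editor's note: for orientation, the iteration loop the workshop README describes maps onto a short git sequence that feeds the unit-test, CI, and CD triggers defined in this change's workflow files. This is a hedged sketch only; `yourname-dev` is a hypothetical participant branch, while `integration` and `main` are the branch names the workflows in this repo actually gate on.

```bash
# Illustrative sketch of the workshop's day-to-day loop.
# "integration" and "main" come from this repo's workflow triggers;
# "yourname-dev" is a hypothetical participant branch name.
git checkout -b yourname-dev

# ...edit code under src/workshop/core/** (e.g. the data engineering step)...
git add src/workshop/core
git commit -m "Refactor feature engineering step"

# Pushing a branch other than main/integration that touches the data
# engineering code is what triggers the feature engineering unit-test workflow.
git push -u origin yourname-dev

# A reviewed pull request merged into "integration" then runs the CI training
# pipeline, and the automatically opened PR into "main" drives the CD
# deploy/test/swap flow shown in the endpoint actions above.
```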
diff --git a/MLOps-ADO-ADB/src/workshop/documents/images/deploy-to-azure.svg b/MLOps-ADO-ADB/src/workshop/documents/images/deploy-to-azure.svg index 61ec2669..7eeab675 100644 --- a/MLOps-ADO-ADB/src/workshop/documents/images/deploy-to-azure.svg +++ b/MLOps-ADO-ADB/src/workshop/documents/images/deploy-to-azure.svg @@ -1,67 +1,67 @@ - - - - - - image/svg+xml - - - - - - - - - - - + + + + + + image/svg+xml + + + + + + + + + + + diff --git a/MLOps-ADO-ADB/src/workshop/documents/part_0.md b/MLOps-ADO-ADB/src/workshop/documents/part_0.md index 6beaee78..e2d8460d 100644 --- a/MLOps-ADO-ADB/src/workshop/documents/part_0.md +++ b/MLOps-ADO-ADB/src/workshop/documents/part_0.md @@ -1,154 +1,154 @@ -# Part 0: Workshop Environment Setup -> NOTE: The Workshop is designed to take place in a customer environment and requires an Azure AD Service Principal, including the Azure AD token for the Service Principal. Many data science and ML platform teams will need to submit a service request for a Service Principal. Plan in enough time for this service request to be processed. - -Read the Workshop scenario overview [here](../README.md#workshop-scenario). - -The steps described here in Part 0 prepare Azure Databricks, Azure DevOps, and an Azure AD Service Principal to serve as the MLOps platform. These steps are to be performed by the platform administrators so that data scientists can start with Part 1 without getting overwhelmed with the infrastructure details involved in getting the core pieces of the MLOps platform linked together. - -In general, expect the platform setup steps in Part 0 to take as long or longer than the workshop steps in Parts 1-5. You may need multiple, multi-hour sessions to complete Part 0, especially if you run into needs for access rights that only admins can grant. - -It is preferable to use a resource group and set of resources that are isolated from other work, so that the data and model assets that workshop participants create don't litter your work environment with workshop assets, and so that you can tear down the workshop environment upon completion. - - -## Pre-requisites for Part 0 -- An Azure Account and Subscription -- Permission to create, or access to, an Azure AD Service Principal -- An understanding of: - - Azure Subscriptions and Resource Groups - - Azure AD Service Principals - - Git mechanics (in this workshop we use Azure Repos and Databricks Repos) - -## Steps - -1. Create a Service Principal in Azure Active Directory -2. Add the Service Principal to your Azure Databricks workspace -3. Add the Service Principal to Azure DevOps -4. Create a variable group in Azure DevOps -5. Register Azure Pipelines -6. Grant workshop participants Azure DevOps permissions and user access -7. Set up branch protection policies in Azure Repo -8. Generate and store data -9. Confirm participant access to a Databricks cluster with ML runtime -10. Generate baseline model and grant Model manage permissions to team -11. Confirm `main` and `integration` branches of repo are up to date -12. Dry run the workshop to double check environment setup - -## 1. Create a Service Principal in Azure Active Directory - -> NOTE: You can skip this section if you've been provided an Azure AD Service Principal by an Admin. - - -## 2. Add the Service Principal to your Azure Databricks workspace - -![Databricks Admin Console > Add service principal](images/part_0_adb_add_sp.png) - -## 3. 
Add the Service Principal to Azure DevOps - -![Azure DevOps > Project Settings > Teams > Add service principal](images/part_0_ado_add_sp.png) - - -## 4. Create a variable group in Azure DevOps - -![Alt text](images/image-9.png) - -You may either grant open access to pipelines, or just to the three pipelines that you register. -![Alt text](images/image-10.png) - - -### 4.1 Choose ADO "utility" user and create PAT (Personal Access Token) - -You are going to create a PAT for some "utility" user in the Azure DevOps project to allow your code access the Azure Repo in the Azure DevOps project. (There is a current Databricks limitation in directly granting the Service Principal git credentials with any git provider.) - -Use this user for `ado_username` and `ado_username_pat`. - -An alternative approach here may be to use a service connection, which you may use as a substitute if you are able. - -## 5. Register Azure Pipelines -![Alt text](images/image-11.png) - -![Alt text](images/image-12.png) - -![Alt text](images/image-13.png) - -![Alt text](images/image-14.png) - -Go to the Pipelines section and select "New pipeline". - -![Pipelines view in Azure DevOps](images/part_2_ado_pipe1.png) - -Select your MLOps-ado-adb repo. - -![Pipelines select repo step](images/part_0_ado_pipe2.png) - -Configure your pipeline using an "Existing Azure Pipelines YAML file": -![Pipelines configure step](images/part_0_ado_pipe3.png) - -Select the `.azure_pipelines/workshop_unit_test.yml` Azure Pipelines YAML file in your branch of the repo, (not in the main branch). - -![Pipelines select yaml step](images/part_0_ado_pipe4.png) - -Give your pipeline a Pipeline Name of "Data Prep Unit Test Pipeline", the click the "Save and run" button to manually trigger the pipeline. -![Pipelines review step](images/part_0_ado_pipe5.png) - - -Select `/.azure_pipelines/workshop_unit_test.yml`. - -Save and rename to "Data Prep Unit Test Pipeline." - -Select `/.azure_pipelines/ci.yml`. - -Save and rename to "Continuous Integration Pipeline." - -Select `/.azure_pipelines/cd.yml`. - -Save and rename to "Continuous Delivery Pipeline." - -## 6. Grant workshop participants Azure DevOps permissions and user access -Be sure that workshop participants can access the Azure Repo. You may need to grant participants Basic User Access rights, in addition to Contributor permissions, so they can see and use the Azure Repo. - -Make sure participants also are able to create new branches in the repo, as they do this in Part 1. - -## 7. Set up branch protection policies in Azure Repo -### 7.1 Integration branch policies -Require approval of merges to `integration`, which triggers the CI pipeline, permitting requestor to approve their own changes. -![Integration branch policies](images/part_0_integration_policies.png) - -### 7.2 Main branch policies -Require approval from two people for merges to `main`, which triggers the CD pipeline, and prohibit the pusher from approving their own changes. -![Main branch policies](images/part_0_main_policies.png) - -## 8. Generate and store data - -In Databricks, navigate to your Databricks Repo and to the notebook `/src/workshop/notebooks/part_0_create_datasets` and run it. - -![Alt text](images/image-15.png) - -![Alt text](images/image-16.png) - -## 9. Confirm participant access to a Databricks cluster with ML runtime -Workshop participants will interactively run notebooks that depend on `mlflow` being installed: make sure participants have access to an ML runtime cluster. - -## 10. 
Generate baseline model and grant Model manage permissions to team -In Databricks, navigate to the repo and run the following three notebooks in sequence: - -1. `/src/workshop/notebooks/part_1_1_data_prep.ipynb` -2. `/src/workshop/notebooks/part_1_2_training.ipynb` -3. `/src/workshop/notebooks/part_1_3_evaluating.ipynb` - -Doing so will create a model named "wine_quality". -Navigate to the Models section of Databricks and change the permissions on the "wine_quality" model so that all users can Manage. - -![Set wine_quality model permissions](images/part_0_set_model_permissions.png) - - -## 11. Confirm `main` and `integration` branches of repo are up to date -If you made any changes to the files so they run in your environment as you wish, be sure to commit those to version control and make sure both the `main` and `integration` branches of the repo reflect your most recent changes. - -## 12. Dry run the workshop to double check environment setup -To ensure you have set up the environment correctly, have someone with only the same permissions as workshop participants do a dry run of the workshop content in Parts 1 through 5. This will take a few hours for one person, but if it uncovers problems that need resolving, resolving it now before you have a room or teams call full of workshop participants will save time and frustration for the large group. - -## Congratulations -You have set up the environment to enable workshop participants to execute the steps described in Parts 1 through 5. - - -## [Go to Part 1](part_1.md) +# Part 0: Workshop Environment Setup +> NOTE: The Workshop is designed to take place in a customer environment and requires an Azure AD Service Principal, including the Azure AD token for the Service Principal. Many data science and ML platform teams will need to submit a service request for a Service Principal. Plan in enough time for this service request to be processed. + +Read the Workshop scenario overview [here](../README.md#workshop-scenario). + +The steps described here in Part 0 prepare Azure Databricks, Azure DevOps, and an Azure AD Service Principal to serve as the MLOps platform. These steps are to be performed by the platform administrators so that data scientists can start with Part 1 without getting overwhelmed with the infrastructure details involved in getting the core pieces of the MLOps platform linked together. + +In general, expect the platform setup steps in Part 0 to take as long or longer than the workshop steps in Parts 1-5. You may need multiple, multi-hour sessions to complete Part 0, especially if you run into needs for access rights that only admins can grant. + +It is preferable to use a resource group and set of resources that are isolated from other work, so that the data and model assets that workshop participants create don't litter your work environment with workshop assets, and so that you can tear down the workshop environment upon completion. + + +## Pre-requisites for Part 0 +- An Azure Account and Subscription +- Permission to create, or access to, an Azure AD Service Principal +- An understanding of: + - Azure Subscriptions and Resource Groups + - Azure AD Service Principals + - Git mechanics (in this workshop we use Azure Repos and Databricks Repos) + +## Steps + +1. Create a Service Principal in Azure Active Directory +2. Add the Service Principal to your Azure Databricks workspace +3. Add the Service Principal to Azure DevOps +4. Create a variable group in Azure DevOps +5. Register Azure Pipelines +6. 
Grant workshop participants Azure DevOps permissions and user access +7. Set up branch protection policies in Azure Repo +8. Generate and store data +9. Confirm participant access to a Databricks cluster with ML runtime +10. Generate baseline model and grant Model manage permissions to team +11. Confirm `main` and `integration` branches of repo are up to date +12. Dry run the workshop to double check environment setup + +## 1. Create a Service Principal in Azure Active Directory + +> NOTE: You can skip this section if you've been provided an Azure AD Service Principal by an Admin. + + +## 2. Add the Service Principal to your Azure Databricks workspace + +![Databricks Admin Console > Add service principal](images/part_0_adb_add_sp.png) + +## 3. Add the Service Principal to Azure DevOps + +![Azure DevOps > Project Settings > Teams > Add service principal](images/part_0_ado_add_sp.png) + + +## 4. Create a variable group in Azure DevOps + +![Alt text](images/image-9.png) + +You may either grant open access to pipelines, or just to the three pipelines that you register. +![Alt text](images/image-10.png) + + +### 4.1 Choose ADO "utility" user and create PAT (Personal Access Token) + +You are going to create a PAT for some "utility" user in the Azure DevOps project to allow your code access the Azure Repo in the Azure DevOps project. (There is a current Databricks limitation in directly granting the Service Principal git credentials with any git provider.) + +Use this user for `ado_username` and `ado_username_pat`. + +An alternative approach here may be to use a service connection, which you may use as a substitute if you are able. + +## 5. Register Azure Pipelines +![Alt text](images/image-11.png) + +![Alt text](images/image-12.png) + +![Alt text](images/image-13.png) + +![Alt text](images/image-14.png) + +Go to the Pipelines section and select "New pipeline". + +![Pipelines view in Azure DevOps](images/part_2_ado_pipe1.png) + +Select your MLOps-ado-adb repo. + +![Pipelines select repo step](images/part_0_ado_pipe2.png) + +Configure your pipeline using an "Existing Azure Pipelines YAML file": +![Pipelines configure step](images/part_0_ado_pipe3.png) + +Select the `.azure_pipelines/workshop_unit_test.yml` Azure Pipelines YAML file in your branch of the repo, (not in the main branch). + +![Pipelines select yaml step](images/part_0_ado_pipe4.png) + +Give your pipeline a Pipeline Name of "Data Prep Unit Test Pipeline", the click the "Save and run" button to manually trigger the pipeline. +![Pipelines review step](images/part_0_ado_pipe5.png) + + +Select `/.azure_pipelines/workshop_unit_test.yml`. + +Save and rename to "Data Prep Unit Test Pipeline." + +Select `/.azure_pipelines/ci.yml`. + +Save and rename to "Continuous Integration Pipeline." + +Select `/.azure_pipelines/cd.yml`. + +Save and rename to "Continuous Delivery Pipeline." + +## 6. Grant workshop participants Azure DevOps permissions and user access +Be sure that workshop participants can access the Azure Repo. You may need to grant participants Basic User Access rights, in addition to Contributor permissions, so they can see and use the Azure Repo. + +Make sure participants also are able to create new branches in the repo, as they do this in Part 1. + +## 7. Set up branch protection policies in Azure Repo +### 7.1 Integration branch policies +Require approval of merges to `integration`, which triggers the CI pipeline, permitting requestor to approve their own changes. 
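If you prefer to script this policy rather than click through the UI shown in the screenshot below, a minimal sketch using the `azure-devops` CLI extension might look like the following. The organization and project values are placeholders for your environment, and the exact options you need may differ.

```bash
# Sketch: require one approver on the integration branch, counting the PR creator's own vote.
# Assumes the Azure CLI with the azure-devops extension is installed and you are signed in.
ORG_URL="https://dev.azure.com/<your-organization>"   # placeholder
PROJECT="<your-project>"                              # placeholder

REPO_ID=$(az repos show --repository MLOps-ado-adb \
  --organization "$ORG_URL" --project "$PROJECT" --query id -o tsv)

az repos policy approver-count create \
  --organization "$ORG_URL" --project "$PROJECT" \
  --repository-id "$REPO_ID" \
  --branch integration \
  --blocking true --enabled true \
  --minimum-approver-count 1 \
  --creator-vote-counts true \
  --allow-downvotes false \
  --reset-on-source-push false
```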
+![Integration branch policies](images/part_0_integration_policies.png) + +### 7.2 Main branch policies +Require approval from two people for merges to `main`, which triggers the CD pipeline, and prohibit the pusher from approving their own changes. +![Main branch policies](images/part_0_main_policies.png) + +## 8. Generate and store data + +In Databricks, navigate to your Databricks Repo and to the notebook `/src/workshop/notebooks/part_0_create_datasets` and run it. + +![Alt text](images/image-15.png) + +![Alt text](images/image-16.png) + +## 9. Confirm participant access to a Databricks cluster with ML runtime +Workshop participants will interactively run notebooks that depend on `mlflow` being installed: make sure participants have access to an ML runtime cluster. + +## 10. Generate baseline model and grant Model manage permissions to team +In Databricks, navigate to the repo and run the following three notebooks in sequence: + +1. `/src/workshop/notebooks/part_1_1_data_prep.ipynb` +2. `/src/workshop/notebooks/part_1_2_training.ipynb` +3. `/src/workshop/notebooks/part_1_3_evaluating.ipynb` + +Doing so will create a model named "wine_quality". +Navigate to the Models section of Databricks and change the permissions on the "wine_quality" model so that all users can Manage. + +![Set wine_quality model permissions](images/part_0_set_model_permissions.png) + + +## 11. Confirm `main` and `integration` branches of repo are up to date +If you made any changes to the files so they run in your environment as you wish, be sure to commit those to version control and make sure both the `main` and `integration` branches of the repo reflect your most recent changes. + +## 12. Dry run the workshop to double check environment setup +To ensure you have set up the environment correctly, have someone with only the same permissions as workshop participants do a dry run of the workshop content in Parts 1 through 5. This will take a few hours for one person, but if it uncovers problems that need resolving, resolving it now before you have a room or teams call full of workshop participants will save time and frustration for the large group. + +## Congratulations +You have set up the environment to enable workshop participants to execute the steps described in Parts 1 through 5. + + +## [Go to Part 1](part_1.md) diff --git a/MLOps-ADO-ADB/src/workshop/documents/part_1.md b/MLOps-ADO-ADB/src/workshop/documents/part_1.md index 9a71961e..5590a1eb 100644 --- a/MLOps-ADO-ADB/src/workshop/documents/part_1.md +++ b/MLOps-ADO-ADB/src/workshop/documents/part_1.md @@ -1,119 +1,119 @@ - -# Part 1: Structure code for fast iterative development -## Pre-requisites -- Complete [Part 0](part_0.md) to set up the required resources and permissions in Azure. - - -## Summary -Your team has been working on a new ML problem. The team has done initial exploratory work preparing the data and fitting models and has now come to a state that the solution direction is mostly solidified. Now it is time to structure the work so that workflow that produces a deployed model is hardened and made maintainable, while also enabling the teamto iterate on it, systematically and quickly, to continue to improve on the deployable solution.   - -So far team members have been working mostly independently in Azure Databricks notebooks that handle their end-to-end model development workflow. 
To enable more effective collaboration for continuous improvement and easier maintenance of the workflow, they will break the workflow into separately maintainable but linked parts. - -As a first step towards MLOps, the team needs to do the following:  - -- Modularize: A single, end-to-end Databricks notebook is refactored into a linked sequence of smaller, "module" notebooks that each focus on a particular stage in the overall model development lifecycle. Modular notebooks can be more easily developed and tested independently and in parallel by multiple members. -- Parameterize: The modular notebooks are parameterized so that they be rerun with different parameter values to exhibit different behavior. We can thus use parameters to, for example, load only ten days of data during unit testing, setting parameter `days=10`, while six months of data during model training that happens during integration testing in the CI pipeline, `days=180`. - -To illustrate how the process works, the monolithic notebook was refactored into a feature engineering notebook, a model training notebook, and an evaluation notebook. You will run these modules individually to see how they work. - - ![monolithic to modular](./images/monolithic_modular.png) - -## Steps - -> Note: You can review notebooks and run the following tasks in the Databricks Repo in your Azure Databricks workspace. - -0. Navigate to `Repos/{your Databricks user account}/MLOps-ado-adb/src/workshop/notebooks`. Depending on your version of the Databricks UI, Repos may either be found within your Workspace tab in the sidebar, or instead be found in a separate tab. - -![Databricks Repo file explorer](images/part_1_db_repo_file_explorer.png) - - -1. Familiarize yourself with the steps in the - notebook in the Databricks Repo at `/notebooks/mlflow-end-to-end-example.ipynb`. The notebook, available in the [Azure Databricks documentation](https://learn.microsoft.com/en-us/azure/databricks/mlflow/end-to-end-example), shows the end-to-end data preparation and model building workflow in a single notebook. - - **There is no need to run this end-to-end notebook as part of this workshop.** - -2. Ask yourself, and discuss with your team, why putting the entire workflow into a single notebook is a challenge to scalable and repeatable ML development. - -Now observe how the monolithic notebook was refactored into a data prep or feature engineering module, a model training module, and a model evaluation module so that each step in the overall process can be developed and run independently. One thing to note about the new modules is that they contain both the code from the monolithic notebook, as well as a definition of the interface, the inputs and outputs needed and produced by the code, so that the modules can be linked together to replicate the entire workflow represented in the monolithic notebook. Additionally, modules can be parameterized so that they behave differently depending on what parameters are passed to them, either during interactive testing, or later on in this workshop, by pipelines that run at different stages in the MLOps lifecycle. - -3. 
The basic version control and git branching strategy we'll use is as follows: -- The `main` branch contains all the code used to develop the model in production -- The `integration` branch starts as a complete copy of `main` -- Data scientists and data engineers create development or feature branches off of `integration` with names like `dev-{yourname}` to experiment with changes to some part of the workflow, in the hopes of finding an improvement in the models produced by the workflow -- If results are promising, the work done in `dev-{yourname}` is merged into `integration` -- If the new work results in a model that outperforms the production model in `main`, then the new code in `integration` becomes the new `main`, and the model is updated to reflect the new ML model training workflow. - - -4. In your Databricks Repo, create your own development branch off of the `integration` branch where you can make and track changes. This branch will be your development area to create and test new code or pipelines before committing or merging the code back into a common branch, such as `integration`. - -To do this, right-click the `/MLOps-ado-adb` folder in your Databricks Repos section of your Workspace, and select the "Git..." option from the drop-down menu. - -![Git options from Databricks Repo](images/part_1_git_options_from_adb_repo.png) - -In the next screen, make sure the `integration` branch is selected from the drop-down menu. -![Databricks Repo branch UI with integration default](images/part_1_branch_ui_integration.png) - -Select "Create Branch." In the next screen, type `dev-{yourname}` in the "Branch name" field and "Create" the branch based on "Branch: integration". -![Databricks Repo create dev branch based on integration branch](images/part_1_adb_create_branch.png) - -After you've created the branch, close the branch window and confirm that `dev-{yourname}` appears in the filepath at the top of the Repos view in Azure Databrcks: -![Databricks Repo file explorer with dev branch selected](images/part_1_adb_file_exp_dev.png) - -While your dev branch is selected, you'll be looking at version-controlled copies of the files from the integration branch. Any changes you make to the files while on your branch will only be reflected in your branch, and not in other branches. - -Next let's review those task-focused notebooks that were refactored from the end-to-end monolithic notebook. - -5. Review the refactored data preparation logic in the notebook at `/notebooks/part_1_1_data_prep.ipynb`. - -This modular notebook focused on data prep does the following: - -- Loads the raw data from dbfs. -- Checks for missing values. -- Does some basic data visualizations. -- Creates a new, binary outcome variable. -- Saves the prepared data to dbfs. - -At the top of the notebook, you'll see comments indicating where to change some parameters regarding the path to where you write the prepared dataset. We do this in the context of the workshop so you aren't overwriting the datasets created by other workshop participants. - -Run this notebook. - -6. Review the refactored model training logic in the `/notebooks/part_1_2_training.ipynb` notebook. - -This modular notebook focused on model training does the following: - -- Loads the data prepared in the data prep notebook. -- Splits the data for training and validation. 
-- Builds a baseline model and -- Registers the model to the Model Registry and labels it as "Staging" - -Change the parameters at the top of the notebook to read the dataset from where you write it in the data prep notebook, and to establish a mlflow model path specific to you. - -Run this notebook. - -7. Review the refactored model evaluation logic in the `notebooks/part_1_3_evaluating.ipynb` notebook. - -This modular notebook focused on model evaluation does the following: - -- Load the test data. -- Load the model registered to staging in the training step. -- Use the trained model to predict on the test data and generate model evaluation metrics. -- If no prior trained model exists, the model will be registered as a baseline model in production. -- If a production model is found, the evaluation metrics for that model will be compared against metrics for the newly trained model and if the new model's metrics surpass those of the current production model, the new model will be registered to production. If not, the notebook exits and raises an exception. - -Change the parameters at the top of the evaluation notebook to read the model from the path in the mlflow model registry where you saved it in the training notebook. - -Run this notebook. - -8. Navigate to the Models section of Azure Databricks to see that a model is produced with name `wine_quality_{yourname}` and labeled as Production. This will be your baseline model that your future iterative development of parts of the ML workflow will aim to beat. - -![Databricks Registered Models View](images/part_1_model_registry.png) - -## Success criteria -- Data prep notebook runs and writes prepared data to dbfs. -- Model training notebook creates and registers a model. -- Model evaluation notebook either promotes a model to production or exits. - - -## [Go to Part 2](part_2.md) - - + +# Part 1: Structure code for fast iterative development +## Pre-requisites +- Complete [Part 0](part_0.md) to set up the required resources and permissions in Azure. + + +## Summary +Your team has been working on a new ML problem. The team has done initial exploratory work preparing the data and fitting models and has now come to a state that the solution direction is mostly solidified. Now it is time to structure the work so that workflow that produces a deployed model is hardened and made maintainable, while also enabling the teamto iterate on it, systematically and quickly, to continue to improve on the deployable solution.   + +So far team members have been working mostly independently in Azure Databricks notebooks that handle their end-to-end model development workflow. To enable more effective collaboration for continuous improvement and easier maintenance of the workflow, they will break the workflow into separately maintainable but linked parts. + +As a first step towards MLOps, the team needs to do the following:  + +- Modularize: A single, end-to-end Databricks notebook is refactored into a linked sequence of smaller, "module" notebooks that each focus on a particular stage in the overall model development lifecycle. Modular notebooks can be more easily developed and tested independently and in parallel by multiple members. +- Parameterize: The modular notebooks are parameterized so that they be rerun with different parameter values to exhibit different behavior. 
We can thus use parameters to, for example, load only ten days of data during unit testing, setting parameter `days=10`, while six months of data during model training that happens during integration testing in the CI pipeline, `days=180`. + +To illustrate how the process works, the monolithic notebook was refactored into a feature engineering notebook, a model training notebook, and an evaluation notebook. You will run these modules individually to see how they work. + + ![monolithic to modular](./images/monolithic_modular.png) + +## Steps + +> Note: You can review notebooks and run the following tasks in the Databricks Repo in your Azure Databricks workspace. + +0. Navigate to `Repos/{your Databricks user account}/MLOps-ado-adb/src/workshop/notebooks`. Depending on your version of the Databricks UI, Repos may either be found within your Workspace tab in the sidebar, or instead be found in a separate tab. + +![Databricks Repo file explorer](images/part_1_db_repo_file_explorer.png) + + +1. Familiarize yourself with the steps in the + notebook in the Databricks Repo at `/notebooks/mlflow-end-to-end-example.ipynb`. The notebook, available in the [Azure Databricks documentation](https://learn.microsoft.com/en-us/azure/databricks/mlflow/end-to-end-example), shows the end-to-end data preparation and model building workflow in a single notebook. + + **There is no need to run this end-to-end notebook as part of this workshop.** + +2. Ask yourself, and discuss with your team, why putting the entire workflow into a single notebook is a challenge to scalable and repeatable ML development. + +Now observe how the monolithic notebook was refactored into a data prep or feature engineering module, a model training module, and a model evaluation module so that each step in the overall process can be developed and run independently. One thing to note about the new modules is that they contain both the code from the monolithic notebook, as well as a definition of the interface, the inputs and outputs needed and produced by the code, so that the modules can be linked together to replicate the entire workflow represented in the monolithic notebook. Additionally, modules can be parameterized so that they behave differently depending on what parameters are passed to them, either during interactive testing, or later on in this workshop, by pipelines that run at different stages in the MLOps lifecycle. + +3. The basic version control and git branching strategy we'll use is as follows: +- The `main` branch contains all the code used to develop the model in production +- The `integration` branch starts as a complete copy of `main` +- Data scientists and data engineers create development or feature branches off of `integration` with names like `dev-{yourname}` to experiment with changes to some part of the workflow, in the hopes of finding an improvement in the models produced by the workflow +- If results are promising, the work done in `dev-{yourname}` is merged into `integration` +- If the new work results in a model that outperforms the production model in `main`, then the new code in `integration` becomes the new `main`, and the model is updated to reflect the new ML model training workflow. + + +4. In your Databricks Repo, create your own development branch off of the `integration` branch where you can make and track changes. This branch will be your development area to create and test new code or pipelines before committing or merging the code back into a common branch, such as `integration`. 
+ +To do this, right-click the `/MLOps-ado-adb` folder in your Databricks Repos section of your Workspace, and select the "Git..." option from the drop-down menu. + +![Git options from Databricks Repo](images/part_1_git_options_from_adb_repo.png) + +In the next screen, make sure the `integration` branch is selected from the drop-down menu. +![Databricks Repo branch UI with integration default](images/part_1_branch_ui_integration.png) + +Select "Create Branch." In the next screen, type `dev-{yourname}` in the "Branch name" field and "Create" the branch based on "Branch: integration". +![Databricks Repo create dev branch based on integration branch](images/part_1_adb_create_branch.png) + +After you've created the branch, close the branch window and confirm that `dev-{yourname}` appears in the filepath at the top of the Repos view in Azure Databrcks: +![Databricks Repo file explorer with dev branch selected](images/part_1_adb_file_exp_dev.png) + +While your dev branch is selected, you'll be looking at version-controlled copies of the files from the integration branch. Any changes you make to the files while on your branch will only be reflected in your branch, and not in other branches. + +Next let's review those task-focused notebooks that were refactored from the end-to-end monolithic notebook. + +5. Review the refactored data preparation logic in the notebook at `/notebooks/part_1_1_data_prep.ipynb`. + +This modular notebook focused on data prep does the following: + +- Loads the raw data from dbfs. +- Checks for missing values. +- Does some basic data visualizations. +- Creates a new, binary outcome variable. +- Saves the prepared data to dbfs. + +At the top of the notebook, you'll see comments indicating where to change some parameters regarding the path to where you write the prepared dataset. We do this in the context of the workshop so you aren't overwriting the datasets created by other workshop participants. + +Run this notebook. + +6. Review the refactored model training logic in the `/notebooks/part_1_2_training.ipynb` notebook. + +This modular notebook focused on model training does the following: + +- Loads the data prepared in the data prep notebook. +- Splits the data for training and validation. +- Builds a baseline model and +- Registers the model to the Model Registry and labels it as "Staging" + +Change the parameters at the top of the notebook to read the dataset from where you write it in the data prep notebook, and to establish a mlflow model path specific to you. + +Run this notebook. + +7. Review the refactored model evaluation logic in the `notebooks/part_1_3_evaluating.ipynb` notebook. + +This modular notebook focused on model evaluation does the following: + +- Load the test data. +- Load the model registered to staging in the training step. +- Use the trained model to predict on the test data and generate model evaluation metrics. +- If no prior trained model exists, the model will be registered as a baseline model in production. +- If a production model is found, the evaluation metrics for that model will be compared against metrics for the newly trained model and if the new model's metrics surpass those of the current production model, the new model will be registered to production. If not, the notebook exits and raises an exception. + +Change the parameters at the top of the evaluation notebook to read the model from the path in the mlflow model registry where you saved it in the training notebook. + +Run this notebook. + +8. 
Navigate to the Models section of Azure Databricks to see that a model is produced with name `wine_quality_{yourname}` and labeled as Production. This will be your baseline model that your future iterative development of parts of the ML workflow will aim to beat. + +![Databricks Registered Models View](images/part_1_model_registry.png) + +## Success criteria +- Data prep notebook runs and writes prepared data to dbfs. +- Model training notebook creates and registers a model. +- Model evaluation notebook either promotes a model to production or exits. + + +## [Go to Part 2](part_2.md) + + diff --git a/MLOps-ADO-ADB/src/workshop/documents/part_2.md b/MLOps-ADO-ADB/src/workshop/documents/part_2.md index a8183b4f..23bbf994 100644 --- a/MLOps-ADO-ADB/src/workshop/documents/part_2.md +++ b/MLOps-ADO-ADB/src/workshop/documents/part_2.md @@ -1,77 +1,77 @@ - -# Part 2: Preparing notebooks for remote triggering - -## Pre-requisites -- Complete [Part 0](part_0.md), [Part 1](part_1.md) -- In your Databricks Repo, have a personal dev branch that you created off of the `integration` branch, named `dev-{yourname}` or similar (for example, `dev-nick`). -- Run each notebook successfully via the Databricks notebook UI -- data prep, training, and evaluating. -- Confirm that you have a Model labeled "Production" in the Models section of Databricks. - -## Summary -After successfully restructuring the end-to-end Databricks notebook into task-focused, modular notebooks, and running those notebooks via the Databricks UI, your team wants to prepare to run the notebooks automatically in response to code changes. - -To do this, we need to move away from untracked, user-driven notebooks to version-controlled notebooks that can be run not by a person but by a Service Principal, which is a type of application with narrowly constrained rights and responsibilities. Your MLOps platform administrator should already have created a Service Principal, granted it the appropriate permissions to interact with your Databricks service, and given you permissions to use the Service Principal to run notebooks and workflows in Databricks. - -To introduce these concepts you will next do the following: -- Preview how to run the data prep notebook via REST API -- Preview how to use the Service Principal to run the notebook -- Review the configuration of an Azure Pipeline in Azure DevOps to run a "unit test" notebook using the Service Principal -- Manually trigger the Azure Pipeline from Azure DevOps - - -## Steps -1. Many actions you take in Databricks, including running a notebook, can be triggered programmatically via API. Let's unpack an example call to the API to run a notebook: -![Sample Databricks API to run job](images/part_2_run_job.png) - -In the first line we see `curl -X POST`, which means we're using a command-line utility, curl, to issue a request to a URL address. -At the bottom we see where the request is sent, to `$(databricks_workspace_uri)/api/2.1/jobs/run-now`. The `$(databricks_workspace_uri)` part is a variable referring to the URI of your Databricks instance, which corresponds to what you can find in the address bar of your browser and is of the form "https://{some string}.azuredatabricks.net/". - -Many variables used in this pipeline were specified in Part 0 during the platform setup, and they are in the Azure DevOps library as a variable group. Those can include secured secrets, including those in a linked Azure Key Vault. 
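Putting these pieces together, the request in the screenshot has roughly the following shape. This is a sketch only: the job ID and notebook parameters are illustrative placeholders rather than the workshop's actual values, and `$(databricks_workspace_uri)` and `$(token)` are the Azure Pipelines variables described above.

```bash
# Sketch only: the general shape of the run-now request dissected above.
# $(databricks_workspace_uri) and $(token) are Azure Pipelines variables, expanded before the
# script runs; the job_id and notebook parameters are illustrative placeholders.
curl -X POST \
  -H "Authorization: Bearer $(token)" \
  -H "Content-Type: application/json" \
  -d '{"job_id": 123, "notebook_params": {"branch": "dev-yourname"}}' \
  "$(databricks_workspace_uri)/api/2.1/jobs/run-now"
```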
- - -After that is the `/api/2.1/jobs/run-now`, which is how we express the command to run the notebook. - -2. Next, in order to run a notebook in Databricks you need to have the right permissions. This is what the `Authorization: Bearer '"$(token)"'` is about. Unless we pass the right token along with the API request, the request will be rejected and no action will be taken. From the `$(token)` notation, we see that the token is a variable. How do we get that token? Prior to making the Databricks API request, we're going to request the token for the Service Principal from the Azure AD API. - -![Azure AD Login API](images/part_2_aad_login.png) - -With this REST call, we are asking Azure AD for an OAuth token for the Service Principal, referred to by its `client_id`. We are passing the `client_secret`, as a variable, for the Service Principal along with the request, and if the request is authorized, we'll get back a JSON message that will include the `access_token` as one of the elements of the JSON, which we use `jq` to parse and extract to the variable `token`. That `token` is what is referenced by `$(token)` in the call to the Databricks API. For security purposes, this token only lives for an hour, so each time we call the API, we'll first have the Service Principal authenticate. - -3. So we now know that running the notebook via the Databricks API requires first authenticating and getting a token that reflects the right permissions. We'd like these two steps to run on a secure machine in a pipeline that can be automated in response to certain events like code changes that have been saved and committed to our repo. Azure Pipelines is a platform enabling just such functionality. - -Here is an example Azure Pipeline definition: - -![Azure Pipeline to run Databricks Notebook](images/part_2_azpipe_run_nb.png) - -In the `steps` section we can see the authentication request (1) followed by the Databricks API request to run a notebook (2). - -These `curl` requests have to be run somewhere, and in the `pool` section we see that Azure Pipelines will run them on a virtual machine running the latest Linux ubuntu OS. - -At the top of the pipeline configuration there is a `trigger` section, which is where we'll specify the conditions under which the steps should be executed. We'll configure the trigger section in Part 3 of the workshop. - -4. For now let's manually trigger the Azure Pipeline to confirm it does what we expect. In your browser, navigate to your Azure DevOps project at https://dev.azure.com and go to the Pipelines section of the sidebar. - -You should see a pipeline named "Data Prep Unit Test Pipeline." Click on the pipeline name to see a list of prior runs of the pipeline, along with a blue button to "Run pipeline". Use the blue "Run pipeline" button to manually trigger the pipeline. - -Be sure to run the pipeline on your dev branch of the repo: - -![Azure Pipeline Job](images/part_2_ado_manual_trigger.png) - -5. Let's review what happens next. Click on the "Job" link to open up the Azure Pipeline job you just saved and ran. -![Azure Pipeline Job](images/part_2_pipe_job.png) - -You'll see a long list of steps that have run on the Linux VM in Azure pipelines. All steps in the Azure Pipeline should show a green checkmark as having successfully completed. 
Many of the steps are utility steps, but they also include the two steps we explicitly defined in the pipeline YAML, namely the authorization step, labeled "Get Entra ID token" -- Entra is the new brand name for Azure AD --, and the run notebook step, labeled "Run Databricks notebook via API". Click on "Run Databricks notebook via API". -![Azure Pipeline Job - Databricks API step](images/part_2_pipe_adb_step.png) - -6. Finally, let's confirm that the command we issued to our Databricks workspace from the Azure Pipeline actually triggered the Databricks notebook to run. In your browser, return to Azure Databricks and navigate to Workflows > Job runs. You should see a job with name "Data Prep Pipeline Run - {your branch name}" that was run as your Service Principal. - - -## Success criteria -- Basic understanding of how to call the Databricks API to run a notebook. -- Basic understanding that the Service Principal can execute the API call, if it is authenticated and has the right permissions. -- Basic understanding of how an Azure Pipeline can be configured to automate the sequence of steps for Service Principal authentication followed by Databricks notebook run using the Databricks API. -- Review the Azure Pipeline run. -- Confirm that the Azure Pipeline ran a notebook job in Azure Databricks. - - -## [Go to Part 3](part_3.md) + +# Part 2: Preparing notebooks for remote triggering + +## Pre-requisites +- Complete [Part 0](part_0.md), [Part 1](part_1.md) +- In your Databricks Repo, have a personal dev branch that you created off of the `integration` branch, named `dev-{yourname}` or similar (for example, `dev-nick`). +- Run each notebook successfully via the Databricks notebook UI -- data prep, training, and evaluating. +- Confirm that you have a Model labeled "Production" in the Models section of Databricks. + +## Summary +After successfully restructuring the end-to-end Databricks notebook into task-focused, modular notebooks, and running those notebooks via the Databricks UI, your team wants to prepare to run the notebooks automatically in response to code changes. + +To do this, we need to move away from untracked, user-driven notebooks to version-controlled notebooks that can be run not by a person but by a Service Principal, which is a type of application with narrowly constrained rights and responsibilities. Your MLOps platform administrator should already have created a Service Principal, granted it the appropriate permissions to interact with your Databricks service, and given you permissions to use the Service Principal to run notebooks and workflows in Databricks. + +To introduce these concepts you will next do the following: +- Preview how to run the data prep notebook via REST API +- Preview how to use the Service Principal to run the notebook +- Review the configuration of an Azure Pipeline in Azure DevOps to run a "unit test" notebook using the Service Principal +- Manually trigger the Azure Pipeline from Azure DevOps + + +## Steps +1. Many actions you take in Databricks, including running a notebook, can be triggered programmatically via API. Let's unpack an example call to the API to run a notebook: +![Sample Databricks API to run job](images/part_2_run_job.png) + +In the first line we see `curl -X POST`, which means we're using a command-line utility, curl, to issue a request to a URL address. +At the bottom we see where the request is sent, to `$(databricks_workspace_uri)/api/2.1/jobs/run-now`. 
The `$(databricks_workspace_uri)` part is a variable referring to the URI of your Databricks instance, which corresponds to what you can find in the address bar of your browser and is of the form "https://{some string}.azuredatabricks.net/". + +Many variables used in this pipeline were specified in Part 0 during the platform setup, and they are in the Azure DevOps library as a variable group. Those can include secured secrets, including those in a linked Azure Key Vault. + + +After that is the `/api/2.1/jobs/run-now`, which is how we express the command to run the notebook. + +2. Next, in order to run a notebook in Databricks you need to have the right permissions. This is what the `Authorization: Bearer '"$(token)"'` is about. Unless we pass the right token along with the API request, the request will be rejected and no action will be taken. From the `$(token)` notation, we see that the token is a variable. How do we get that token? Prior to making the Databricks API request, we're going to request the token for the Service Principal from the Azure AD API. + +![Azure AD Login API](images/part_2_aad_login.png) + +With this REST call, we are asking Azure AD for an OAuth token for the Service Principal, referred to by its `client_id`. We are passing the `client_secret`, as a variable, for the Service Principal along with the request, and if the request is authorized, we'll get back a JSON message that will include the `access_token` as one of the elements of the JSON, which we use `jq` to parse and extract to the variable `token`. That `token` is what is referenced by `$(token)` in the call to the Databricks API. For security purposes, this token only lives for an hour, so each time we call the API, we'll first have the Service Principal authenticate. + +3. So we now know that running the notebook via the Databricks API requires first authenticating and getting a token that reflects the right permissions. We'd like these two steps to run on a secure machine in a pipeline that can be automated in response to certain events like code changes that have been saved and committed to our repo. Azure Pipelines is a platform enabling just such functionality. + +Here is an example Azure Pipeline definition: + +![Azure Pipeline to run Databricks Notebook](images/part_2_azpipe_run_nb.png) + +In the `steps` section we can see the authentication request (1) followed by the Databricks API request to run a notebook (2). + +These `curl` requests have to be run somewhere, and in the `pool` section we see that Azure Pipelines will run them on a virtual machine running the latest Linux ubuntu OS. + +At the top of the pipeline configuration there is a `trigger` section, which is where we'll specify the conditions under which the steps should be executed. We'll configure the trigger section in Part 3 of the workshop. + +4. For now let's manually trigger the Azure Pipeline to confirm it does what we expect. In your browser, navigate to your Azure DevOps project at https://dev.azure.com and go to the Pipelines section of the sidebar. + +You should see a pipeline named "Data Prep Unit Test Pipeline." Click on the pipeline name to see a list of prior runs of the pipeline, along with a blue button to "Run pipeline". Use the blue "Run pipeline" button to manually trigger the pipeline. + +Be sure to run the pipeline on your dev branch of the repo: + +![Azure Pipeline Job](images/part_2_ado_manual_trigger.png) + +5. Let's review what happens next. 
Click on the "Job" link to open up the Azure Pipeline job you just saved and ran. +![Azure Pipeline Job](images/part_2_pipe_job.png) + +You'll see a long list of steps that have run on the Linux VM in Azure pipelines. All steps in the Azure Pipeline should show a green checkmark as having successfully completed. Many of the steps are utility steps, but they also include the two steps we explicitly defined in the pipeline YAML, namely the authorization step, labeled "Get Entra ID token" -- Entra is the new brand name for Azure AD --, and the run notebook step, labeled "Run Databricks notebook via API". Click on "Run Databricks notebook via API". +![Azure Pipeline Job - Databricks API step](images/part_2_pipe_adb_step.png) + +6. Finally, let's confirm that the command we issued to our Databricks workspace from the Azure Pipeline actually triggered the Databricks notebook to run. In your browser, return to Azure Databricks and navigate to Workflows > Job runs. You should see a job with name "Data Prep Pipeline Run - {your branch name}" that was run as your Service Principal. + + +## Success criteria +- Basic understanding of how to call the Databricks API to run a notebook. +- Basic understanding that the Service Principal can execute the API call, if it is authenticated and has the right permissions. +- Basic understanding of how an Azure Pipeline can be configured to automate the sequence of steps for Service Principal authentication followed by Databricks notebook run using the Databricks API. +- Review the Azure Pipeline run. +- Confirm that the Azure Pipeline ran a notebook job in Azure Databricks. + + +## [Go to Part 3](part_3.md) diff --git a/MLOps-ADO-ADB/src/workshop/documents/part_3.md b/MLOps-ADO-ADB/src/workshop/documents/part_3.md index 86bb6af9..4466b2c6 100644 --- a/MLOps-ADO-ADB/src/workshop/documents/part_3.md +++ b/MLOps-ADO-ADB/src/workshop/documents/part_3.md @@ -1,78 +1,78 @@ - -# Part 3: Use Azure DevOps with Databricks Repos for Automation - -## Pre-requisites -- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md) - -## Summary -Although there are a variety of ways to implement MLOps, the aim is to have a balance between a controlled, secured operational environment that delivers predictions or other services from ML models --this is the "Ops" in MLOps--, together with a way of integrating the innovations and improvements made by data scientists and engineers in the "ML" workflow. - -The code that generates the model that delivers predictions in production needs to be maintained no matter what, so any potential improvements that the data science and engineering teams make cannot disrupt the delivery of predictions. To prevent disruptions and to enable continuous delivery, it's common to require new ideas, committed to code, to pass through several stages of review before replacing parts of what was already in production. - -The basic stages that we'll introduce in this workshop are the following: -- "Unit test": Does a notebook that has been changed still run successfully on its own? -- Continuous Integration: If a notebook has changed, does the overall workflow, composed of several notebooks, still run successfully? -- Continuous Delivery: If the overall workflow runs successfully, it produces a model, so does that model outperform the model already in production? 
- -Because we want these stages to be required whenever someone proposes changing an element of the existing workflow, and because each stage has several steps that must be performed consistently every time, we make use of an automation tool, Azure Pipelines. - -We want certain steps to be taken every time changes are made to different parts of the code base, typically with different conditions in different branches. So we will configure the triggers of Azure Pipelines to watch for changes in the code repo. - -In this part, we focus on the "unit test." - -You may have noticed in Part 2 that the Azure Pipeline you worked with had "unit_test" in the YAML file name. We're putting "unit test" in quotes here, because it's not precisely what a software engineer would call a unit test, but the basic idea is similar: we want to assess whether a small but meaningful component of the code works as expected on its own, before we move on to trying to integrate it with everything else. - - -## Steps -1. First, let's review the Azure Pipeline YAML from Part 2 for what triggers the pipeline. In Azure DevOps, navigate to Azure Pipelines, click on the pipeline you triggered manually in Part 2, and click on the "Edit" button in the upper right. This will give you a way of viewing the pipeline YAML file. You can also navigate to this file in the Repos section of Azure DevOps, by clicking on Repos and into the `/.azure_pipelines` directory, then clicking on `workshop_unit_test.yml`. - -![Unit Test triggers config](images/part_3_unit_test_triggers.png) - - -We can see in the YAML configuring the Azure Pipeline that there are `branches` and `paths` triggers. The `branches` triggers state that any code changes committed to a branch that is not ("exclude") the `main` or `integration` branch and is on ("include") your personal dev branch, will trigger the pipeline. We only need the "include" branch trigger condition in this case, but when you are out of the workshop, you'll commonly encounter the condition to unit test every code change not on main or integration. - -Recall that `integration` starts as a copy of main, and that any new data science or engineering work should be done on a branch that is made off of `integration`. In Part 1, you created a development branch in your Databricks Repo named something like `dev-{yourname}`. Next, the `paths` triggers tell Azure Pipelines to observe changes to a particular file path, in this case the path to the data prep notebook. - -This trigger configuration then has Azure Pipelines watching your data prep notebook on branches like your development branch, and will run this automated set of tests when you commit the changes you make to your notebook, just to make sure that the notebook still runs successfully. -> Note: One could incorporate other tests at this stage, but for simplicity in demonstrating the basic concept, we focus here only on the notebook running successfully. - - -2. Next, now that we know which committed changes to which notebooks on which git branches should trigger this Azure pipeline, let's see if changes we make and commit to the data prep notebook actually do trigger the unit test pipeline. In Databricks, navigate to your Workspace > Repos and to the MLOps-ado-adb folder. Confirm that you are on your dev branch, likely named `dev-{yourname}`. - -In the Databricks Repo, open the `/notebooks/part_1_1_data_prep` Databricks notebook. Make any minor change to the notebook. 
This can be a change simply to the markdown comments, or something in the code cells that won't stop the notebook from running from top to bottom. You can manually `Run all` to confirm that your changes don't cause the notebook to raise an exception before completing. - -3. Changes you make in a Databricks notebook automatically save to the Workspace, but are not automatically reflected in the Azure Repo that the Databricks Repo is linked to. In order to sync the Databricks Repo with the Azure Repo, you need to execute two git actions, you need to `commit` the code changes and `push` the changes to the Azure repo. To do this from your Databricks Repo, you can click on the git icon with your branch name at the top of your Databricks notebook. In this example the branch is named `dev-{yourname}`: -![Databricks notebook link to repo](images/part_3_adb_repo_link_in_nb.png) - -Next, you are in the Databricks Repo interface, where you must first enter a "Commit message", and then you can select "Commit & Push" to commit the changes and push them to the Azure Repo. -![Databricks repo commit and push](images/part_3_adb_repo_commit_push.png) - -Next, let's trace out the consequences of your code commit and push. - -4. First, in Azure DevOps navigate to the Azure Repo and to the notebook file you changed to see that the changes you committed are reflected in the history of the file. - - -5. Next, the Azure Pipeline trigger conditions that we reviewed in step 1 seem to be satisfied by the changes we just committed to the Azure Repo: We changed the file on the watched path in a branch that is neither `main` nor `integration`. So did we trigger the Azure Pipeline? In Azure DevOps, navigate to your Pipelines. In the "Recently run pipelines" list, can you find the relevant pipeline, "Data Prep Unit Test Pipeline" and does a recent run correspond to your commit and push (try looking at the messages to find one that matches your commit message)? - -Importantly, did the pipeline successfully run to completion? There will be a solid green circle with a checkmark in it if it did. - -6. Finally, the Azure Pipeline uses a Service Principal with notebook run privileges on your Databricks workspace to run the Databricks notebook that you changed. Back in Azure Databricks, navigate to the Job Runs section to see whether the notebook you changed was run by the Service Principal, as expected. - -## Conclusion -The "unit test" described here is a basic introduction to how all the key pieces in an MLOps implementation fit together. - -- A data scientist or data engineer creates a dev branch off of `integration`, where they make changes to Databricks notebooks in the Databricks Repo that are designed to improve the overall flow in some way -- Once the changes made in the Databricks Repo are deemed ready for potential inclusion in the production code in `main`, the changes are committed and pushed to the Azure Repo in Azure DevOps -- An Azure Pipeline in Azure DevOps is configured to watch for file changes in certain branches, and an automated set of steps are triggered by the commit to the Azure Repo -- The Azure Pipeline authenticates the Azure Service Principal and enables the Service Principal to run the notebook via the Databricks API - -If the unit test Azure Pipeline runs successfully, then we expect the code changes in the one notebook are ready to be tested in the context of the full workflow consisting of multiple Databricks notebooks. 
Testing in the context of the full workflow is known as integration testing, and is the focus of Part 4. - -## Success criteria -- You made a (minor) change to the data prep Databricks notebook in your dev branch in your Databricks Repo -- You committed and pushed the changes from the Databricks Repo, and the changes were reflected in the Azure Repo -- Your commit to the Azure Repo triggered the relevant Azure Pipeline -- The pipeline triggered the Databricks notebook run -- The databricks notebook ran to completion successfully, providing evidence that whatever other effects the changes may have had, they haven't introduced changes that stop the notebook from running to completion - -## [Go to Part 4](part_4.md) + +# Part 3: Use Azure DevOps with Databricks Repos for Automation + +## Pre-requisites +- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md) + +## Summary +Although there are a variety of ways to implement MLOps, the aim is to have a balance between a controlled, secured operational environment that delivers predictions or other services from ML models --this is the "Ops" in MLOps--, together with a way of integrating the innovations and improvements made by data scientists and engineers in the "ML" workflow. + +The code that generates the model that delivers predictions in production needs to be maintained no matter what, so any potential improvements that the data science and engineering teams make cannot disrupt the delivery of predictions. To prevent disruptions and to enable continuous delivery, it's common to require new ideas, committed to code, to pass through several stages of review before replacing parts of what was already in production. + +The basic stages that we'll introduce in this workshop are the following: +- "Unit test": Does a notebook that has been changed still run successfully on its own? +- Continuous Integration: If a notebook has changed, does the overall workflow, composed of several notebooks, still run successfully? +- Continuous Delivery: If the overall workflow runs successfully, it produces a model, so does that model outperform the model already in production? + +Because we want these stages to be required whenever someone proposes changing an element of the existing workflow, and because each stage has several steps that must be performed consistently every time, we make use of an automation tool, Azure Pipelines. + +We want certain steps to be taken every time changes are made to different parts of the code base, typically with different conditions in different branches. So we will configure the triggers of Azure Pipelines to watch for changes in the code repo. + +In this part, we focus on the "unit test." + +You may have noticed in Part 2 that the Azure Pipeline you worked with had "unit_test" in the YAML file name. We're putting "unit test" in quotes here, because it's not precisely what a software engineer would call a unit test, but the basic idea is similar: we want to assess whether a small but meaningful component of the code works as expected on its own, before we move on to trying to integrate it with everything else. + + +## Steps +1. First, let's review the Azure Pipeline YAML from Part 2 for what triggers the pipeline. In Azure DevOps, navigate to Azure Pipelines, click on the pipeline you triggered manually in Part 2, and click on the "Edit" button in the upper right. This will give you a way of viewing the pipeline YAML file. 
You can also navigate to this file in the Repos section of Azure DevOps, by clicking on Repos and into the `/.azure_pipelines` directory, then clicking on `workshop_unit_test.yml`. + +![Unit Test triggers config](images/part_3_unit_test_triggers.png) + + +We can see in the YAML configuring the Azure Pipeline that there are `branches` and `paths` triggers. The `branches` triggers state that any code change committed to a branch that is not ("exclude") the `main` or `integration` branch and is on ("include") your personal dev branch will trigger the pipeline. We only need the "include" branch trigger condition in this case, but outside the workshop you'll commonly encounter the broader condition of unit testing every code change that is not on `main` or `integration`. + +Recall that `integration` starts as a copy of main, and that any new data science or engineering work should be done on a branch that is made off of `integration`. In Part 1, you created a development branch in your Databricks Repo named something like `dev-{yourname}`. Next, the `paths` triggers tell Azure Pipelines to observe changes to a particular file path, in this case the path to the data prep notebook. + +This trigger configuration then has Azure Pipelines watching your data prep notebook on branches like your development branch, and will run this automated set of tests when you commit the changes you make to your notebook, just to make sure that the notebook still runs successfully. +> Note: One could incorporate other tests at this stage, but for simplicity in demonstrating the basic concept, we focus here only on the notebook running successfully. + + +2. Next, now that we know which committed changes to which notebooks on which git branches should trigger this Azure Pipeline, let's see if changes we make and commit to the data prep notebook actually do trigger the unit test pipeline. In Databricks, navigate to your Workspace > Repos and to the MLOps-ado-adb folder. Confirm that you are on your dev branch, likely named `dev-{yourname}`. + +In the Databricks Repo, open the `/notebooks/part_1_1_data_prep` Databricks notebook. Make any minor change to the notebook. This can be a change simply to the markdown comments, or something in the code cells that won't stop the notebook from running from top to bottom. You can manually `Run all` to confirm that your changes don't cause the notebook to raise an exception before completing. + +3. Changes you make in a Databricks notebook automatically save to the Workspace, but are not automatically reflected in the Azure Repo that the Databricks Repo is linked to. To sync the Databricks Repo with the Azure Repo, you need to execute two git actions: `commit` the code changes and `push` them to the Azure Repo. To do this from your Databricks Repo, you can click on the git icon with your branch name at the top of your Databricks notebook. In this example the branch is named `dev-{yourname}`: +![Databricks notebook link to repo](images/part_3_adb_repo_link_in_nb.png) + +This opens the Databricks Repo interface, where you first enter a "Commit message" and then select "Commit & Push" to commit the changes and push them to the Azure Repo. +![Databricks repo commit and push](images/part_3_adb_repo_commit_push.png) + +Next, let's trace out the consequences of your code commit and push. + +4. 
First, in Azure DevOps navigate to the Azure Repo and to the notebook file you changed to see that the changes you committed are reflected in the history of the file. + + +5. Next, the Azure Pipeline trigger conditions that we reviewed in step 1 seem to be satisfied by the changes we just committed to the Azure Repo: We changed the file on the watched path in a branch that is neither `main` nor `integration`. So did we trigger the Azure Pipeline? In Azure DevOps, navigate to your Pipelines. In the "Recently run pipelines" list, can you find the relevant pipeline, "Data Prep Unit Test Pipeline", and does a recent run correspond to your commit and push (try looking at the messages to find one that matches your commit message)? + +Importantly, did the pipeline successfully run to completion? There will be a solid green circle with a checkmark in it if it did. + +6. Finally, the Azure Pipeline uses a Service Principal with notebook run privileges on your Databricks workspace to run the Databricks notebook that you changed. Back in Azure Databricks, navigate to the Job Runs section to see whether the notebook you changed was run by the Service Principal, as expected. + +## Conclusion +The "unit test" described here is a basic introduction to how all the key pieces in an MLOps implementation fit together. + +- A data scientist or data engineer creates a dev branch off of `integration`, where they make changes to Databricks notebooks in the Databricks Repo that are designed to improve the overall flow in some way +- Once the changes made in the Databricks Repo are deemed ready for potential inclusion in the production code in `main`, the changes are committed and pushed to the Azure Repo in Azure DevOps +- An Azure Pipeline in Azure DevOps is configured to watch for file changes in certain branches, and an automated set of steps is triggered by the commit to the Azure Repo +- The Azure Pipeline authenticates as the Azure Service Principal and uses it to run the notebook via the Databricks API (see the sketch below) + +If the unit test Azure Pipeline runs successfully, then we expect that the code changes in that one notebook are ready to be tested in the context of the full workflow consisting of multiple Databricks notebooks. Testing in the context of the full workflow is known as integration testing, and is the focus of Part 4. 
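To make that last point concrete, here is a minimal, illustrative sketch of the kind of one-time run request an automation step can send to the Databricks Jobs API to execute a single notebook. This is not the workshop's actual pipeline step (that lives in `workshop_unit_test.yml`); the workspace URL, token, repository URL, branch name, and cluster settings below are placeholder assumptions.

```python
# Hypothetical sketch: submit a one-time run of the data prep notebook via the
# Databricks Jobs API 2.1. All values below are placeholders, not the workshop's settings.
import os
import requests

host = os.environ["DATABRICKS_HOST"]    # e.g. https://adb-<workspace-id>.<n>.azuredatabricks.net
token = os.environ["DATABRICKS_TOKEN"]  # token obtained for the Service Principal

payload = {
    "run_name": "unit test: part_1_1_data_prep",
    "git_source": {
        "git_url": "https://dev.azure.com/<org>/<project>/_git/<repo>",  # placeholder repo URL
        "git_provider": "azureDevOpsServices",
        "git_branch": "dev-yourname",                                    # placeholder branch
    },
    "tasks": [
        {
            "task_key": "data_prep_unit_test",
            "notebook_task": {
                "notebook_path": "src/workshop/notebooks/part_1_1_data_prep",
                "source": "GIT",
            },
            # Illustrative small cluster; the real pipeline defines its own cluster spec.
            "new_cluster": {
                "spark_version": "13.3.x-cpu-ml-scala2.12",
                "node_type_id": "Standard_DS3_v2",
                "num_workers": 1,
            },
        }
    ],
}

resp = requests.post(
    f"{host}/api/2.1/jobs/runs/submit",
    headers={"Authorization": f"Bearer {token}"},
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print("Submitted one-time run with run_id:", resp.json()["run_id"])
```

If a run submitted this way completes successfully, the same evidence you checked manually above (a run in the Job Runs section, executed by the Service Principal) appears for it as well.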
+ +## Success criteria +- You made a (minor) change to the data prep Databricks notebook in your dev branch in your Databricks Repo +- You committed and pushed the changes from the Databricks Repo, and the changes were reflected in the Azure Repo +- Your commit to the Azure Repo triggered the relevant Azure Pipeline +- The pipeline triggered the Databricks notebook run +- The databricks notebook ran to completion successfully, providing evidence that whatever other effects the changes may have had, they haven't introduced changes that stop the notebook from running to completion + +## [Go to Part 4](part_4.md) diff --git a/MLOps-ADO-ADB/src/workshop/documents/part_4.md b/MLOps-ADO-ADB/src/workshop/documents/part_4.md index e5e9be92..238ee4b3 100644 --- a/MLOps-ADO-ADB/src/workshop/documents/part_4.md +++ b/MLOps-ADO-ADB/src/workshop/documents/part_4.md @@ -1,141 +1,141 @@ -# Part 4: Continuous Integration (CI) - -## Pre-requisites -- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md), [Part 3](part_3.md) - -## Summary -After successfully using Azure Pipelines to automate unit testing of changes made to an individual notebook that represents a step in the overall model development workflow, your team wants to test that the changed notebook also integrates with the overall workflow. Continuous Integration (CI) is the process of developing, testing, integrating, and evaluating new features in a staging environment, as a way of confirming they are ready for deployment and release. - - -## Steps: -1. The notebook that was unit tested in Part 3 is also part of an overall end-to-end workflow. In Databricks you may have linked multiple notebook jobs together into a workflow using the UI. Here we will specify a `training_workflow` using code we commit to the Repo. - -We use the workflow to link the three modular notebooks into a sequence of steps that depend on the success of prior steps, and that the notebooks reflect the code committed to the `integration` branch of the Azure Repo. So if we want to test whether the new code in a notebook integrates with the other pieces of this workflow, the new code will need to make its way in `integration`. The workflow looks like this. - -![Databricks training workflow](images/part_4_adb_training_workflow.png) - - -2. The logic of integration testing is that the entire Databricks training workflow should be triggered if the new code has passed unit testing and been merged into the `integration` branch. Let's set the Azure Pipeline triggers to watch for an approved merge into `integration`. - -``` -# .azure_pipelines/ci.yml - -trigger: - branches: - exclude: - - main - include: - - integration - paths: - include: - - src/workshop/notebooks/part_1_1_data_prep.ipynb - - src/workshop/notebooks/part_1_2_training.ipynb - - src/workshop/notebooks/part_1_3_evaluating.ipynb - - .azure_pipelines/ci.yml - -# ...remainder of the Azure Pipeline config for CI -``` - -With the Azure Pipeline triggers thus configured, now if the unit test passed on notebook code change, then to get that code into the `integration` branch we need to execute a Pull PR to merge dev branch into integration. And if the PR is merged to the `integration` branch, the continuous integration (CI) Pipeline will automatically run the Databricks workflow job that includes an evaluation step. - -Note that we have included the json that specifies the Databricks model training workflow right in the CI pipeline configuration. 
- -``` -# .azure_pipelines/ci.yml - -# pipeline config code -# ... - -# databricks workflow config - "run_name": "Model Training Workflow - '"$(BRANCH_NAME)"'", - "tasks": [ - { - "task_key": "data_prep", - "notebook_task": { - "notebook_path": "src/workshop/notebooks/part_1_1_data_prep", - "source": "GIT", - "base_parameters": { - "run_name": "'"$(BRANCH_NAME)"'" - } - }, - "new_cluster": '"$cluster_def"' - }, - { - "task_key": "model_training", - "notebook_task": { - "notebook_path": "src/workshop/notebooks/part_1_2_training", - "source": "GIT", - "base_parameters": { - "run_name": "'"$(BRANCH_NAME)"'" - } - }, - "depends_on": [ {"task_key": "data_prep"} ], - "new_cluster": '"$cluster_def"' - }, - { - "task_key": "model_evaluation", - "notebook_task": { - "notebook_path": "src/workshop/notebooks/part_1_3_evaluating", - "source": "GIT", - "base_parameters": { - "run_name": "'"$(BRANCH_NAME)"'", - "devops_action": "Integration" - } - }, - "depends_on": [ {"task_key": "model_training"} ], - "new_cluster": '"$cluster_def"' - } - - ], - -# remainder of pipeline config file -# ... -``` - -3. Now that we have the CI pipeline configured, let's make a meaningful change to the training notebook. Remember from Part 1 that the evaluation notebook will exit if the model trained by the Databricks training workflow doesn't outperform the current model in production when compared on test data. If the evaluation notebook exits, then the workflow will not complete and the CI pipeline will fail, as is desired. So let's update the training notebook to include a technique likely to outperform the baseline model, namely hyperparameter search. - -Navigate to your Databricks Repo, to `/notebooks/part_1_2_training`. Make sure you are on the development branch you created in Part 1. In the `/notebooks` folder you'll find some sample code for hyperparameter search and model selection in `/notebooks/part_4_new_training_code`. Copy this code into the model training cell in the original `part_1_2_training`. - -4. Once you're sure the edited notebook will run, Commit and Push your changes from the Databricks Repo to Azure Repo, using the steps you practiced in Part 3. - -> Question: Will your commit and push trigger the unit test Azure Pipeline? Why or why not? Is this desirable or not? How could you improve the situation? - -5. Now you've pushed the code to the Azure Repo, but it's still on your dev branch. To trigger the CI pipeline, you need to make a pull request (PR) from your personal dev branch, which is named something like `dev-{yourname}`, to the `integration` branch. Pull Request management is not currently supported in Databricks Repos, so you'll need to go to the Azure Repo in Azure DevOps to make a PR. - - >Action Items: - >- Navigate to your Azure Repo in Azure DevOps - >- Click on the "Pull requests" tab and then click on "New pull request". - > - >- Specify that your pull request is to merge `yourname-dev` into the `integration` branch. - - ![Alt text](images/image-8.png) - - >- Give the PR a title and brief description, then click on "Create" pull request. - -6. Merge the pull request into integration to trigger CI pipeline. (Self-merge of PRs to integration allowed by branch protection rules) - -The merge to the integration branch triggers the workshop_ci workflow. Click on the Actions tab on your repository and you should see CI workflow running. Click and examine all the steps, note that the CI Workflow is running the steps in the `ci.yml`. 
- - The CI workflow has multiple steps, including logging in to Azure, accessing the relevant code from the remote git repository in Azure Repos, and specifying and running the Databricks training model pipeline. - - As a part of this workflow, the updated model from our current changes is compared to our best previous model and if it performs better it passes the evaluation step, which is reflected in the evaluation notebook. - - - -7. Confirm that Azure Pipelines CI pipeline triggers Databricks workflow job. - - -8. If workflow completes successfully, and the model evaluation step shows improvement, then the new model produced by the workflow involving the notebooks is registered and promoted to the "Staging" slot in the model registry. - - -9. If your new model training code passed the integration testing (and the unit testing before that), then it seems likely you've made an improvement to the model training workflow and the new workflow should be replace the old one reflected in the `main` branch. In Part 5, we'll establish a continuous deployment (CD) Azure Pipeline and set up appropriate triggers to automate and manage the promotion of the new workflow to the production, `main` branch. - - -## Success criteria -- Trigger CI workflow when a pull request is merged to the integration branch -- Successfully run the CI pipeline which also runs the Databricks end-to-end workflow -- Check in Databricks whether the new model performs well enough to potentially replace the current best model. - - -## [Go to Part 5](part_5.md) - +# Part 4: Continuous Integration (CI) + +## Pre-requisites +- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md), [Part 3](part_3.md) + +## Summary +After successfully using Azure Pipelines to automate unit testing of changes made to an individual notebook that represents a step in the overall model development workflow, your team wants to test that the changed notebook also integrates with the overall workflow. Continuous Integration (CI) is the process of developing, testing, integrating, and evaluating new features in a staging environment, as a way of confirming they are ready for deployment and release. + + +## Steps: +1. The notebook that was unit tested in Part 3 is also part of an overall end-to-end workflow. In Databricks you may have linked multiple notebook jobs together into a workflow using the UI. Here we will specify a `training_workflow` using code we commit to the Repo. + +We use the workflow to link the three modular notebooks into a sequence of steps, where each step depends on the success of the prior steps and the notebooks reflect the code committed to the `integration` branch of the Azure Repo. So if we want to test whether the new code in a notebook integrates with the other pieces of this workflow, the new code will need to make its way into `integration`. The workflow looks like this: + +![Databricks training workflow](images/part_4_adb_training_workflow.png) + + +2. The logic of integration testing is that the entire Databricks training workflow should be triggered if the new code has passed unit testing and been merged into the `integration` branch. Let's set the Azure Pipeline triggers to watch for an approved merge into `integration`. 
+ +``` +# .azure_pipelines/ci.yml + +trigger: + branches: + exclude: + - main + include: + - integration + paths: + include: + - src/workshop/notebooks/part_1_1_data_prep.ipynb + - src/workshop/notebooks/part_1_2_training.ipynb + - src/workshop/notebooks/part_1_3_evaluating.ipynb + - .azure_pipelines/ci.yml + +# ...remainder of the Azure Pipeline config for CI +``` + +With the Azure Pipeline triggers configured this way, once the unit test has passed on a notebook code change, we need to open a pull request (PR) that merges the dev branch into `integration` in order to get that code into the `integration` branch. And if the PR is merged to the `integration` branch, the continuous integration (CI) pipeline will automatically run the Databricks workflow job that includes an evaluation step. + +Note that we have included the JSON that specifies the Databricks model training workflow right in the CI pipeline configuration. + +``` +# .azure_pipelines/ci.yml + +# pipeline config code +# ... + +# databricks workflow config + "run_name": "Model Training Workflow - '"$(BRANCH_NAME)"'", + "tasks": [ + { + "task_key": "data_prep", + "notebook_task": { + "notebook_path": "src/workshop/notebooks/part_1_1_data_prep", + "source": "GIT", + "base_parameters": { + "run_name": "'"$(BRANCH_NAME)"'" + } + }, + "new_cluster": '"$cluster_def"' + }, + { + "task_key": "model_training", + "notebook_task": { + "notebook_path": "src/workshop/notebooks/part_1_2_training", + "source": "GIT", + "base_parameters": { + "run_name": "'"$(BRANCH_NAME)"'" + } + }, + "depends_on": [ {"task_key": "data_prep"} ], + "new_cluster": '"$cluster_def"' + }, + { + "task_key": "model_evaluation", + "notebook_task": { + "notebook_path": "src/workshop/notebooks/part_1_3_evaluating", + "source": "GIT", + "base_parameters": { + "run_name": "'"$(BRANCH_NAME)"'", + "devops_action": "Integration" + } + }, + "depends_on": [ {"task_key": "model_training"} ], + "new_cluster": '"$cluster_def"' + } + + ], + +# remainder of pipeline config file +# ... +``` + +3. Now that we have the CI pipeline configured, let's make a meaningful change to the training notebook. Remember from Part 1 that the evaluation notebook will exit if the model trained by the Databricks training workflow doesn't outperform the current model in production when compared on test data. If the evaluation notebook exits, then the workflow will not complete and the CI pipeline will fail, as is desired. So let's update the training notebook to include a technique likely to outperform the baseline model, namely hyperparameter search. + +Navigate to your Databricks Repo, to `/notebooks/part_1_2_training`. Make sure you are on the development branch you created in Part 1. In the `/notebooks` folder you'll find some sample code for hyperparameter search and model selection in `/notebooks/part_4_new_training_code`. Copy this code into the model training cell in the original `part_1_2_training` (a generic sketch of this kind of hyperparameter search code appears below).
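For orientation, here is a generic, hedged sketch of what hyperparameter search for a random forest classifier can look like with scikit-learn. The workshop's actual sample code in `/notebooks/part_4_new_training_code` may differ in technique and parameters; `X_train` and `y_train` are assumed to be the training features and labels already prepared in earlier cells of `part_1_2_training`.

```python
# Generic sketch of hyperparameter search; not the workshop's sample code.
# Assumes X_train and y_train exist from the earlier data prep / split cells.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_leaf": [1, 2, 5],
}

search = RandomizedSearchCV(
    RandomForestClassifier(random_state=123),
    param_distributions=param_distributions,
    n_iter=10,              # number of sampled parameter settings
    scoring="roc_auc",      # evaluate candidates by AUC
    cv=3,
    random_state=123,
)
search.fit(X_train, y_train)

print("Best cross-validated AUC:", search.best_score_)
model = search.best_estimator_  # use the tuned model in place of the baseline model
```

Whatever search technique you use, the point for CI is simply that the training notebook now produces a candidate model with a realistic chance of beating the model currently in production, so the evaluation step can pass.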
 + +4. Once you're sure the edited notebook will run, Commit and Push your changes from the Databricks Repo to the Azure Repo, using the steps you practiced in Part 3. + +> Question: Will your commit and push trigger the unit test Azure Pipeline? Why or why not? Is this desirable or not? How could you improve the situation? + +5. Now you've pushed the code to the Azure Repo, but it's still on your dev branch. To trigger the CI pipeline, you need to make a pull request (PR) from your personal dev branch, which is named something like `dev-{yourname}`, to the `integration` branch. Pull Request management is not currently supported in Databricks Repos, so you'll need to go to the Azure Repo in Azure DevOps to make a PR. + + >Action Items: + >- Navigate to your Azure Repo in Azure DevOps + >- Click on the "Pull requests" tab and then click on "New pull request". + > + >- Specify that your pull request is to merge `dev-{yourname}` into the `integration` branch. + + ![Alt text](images/image-8.png) + + >- Give the PR a title and brief description, then click on "Create" to create the pull request. + +6. Merge the pull request into `integration` to trigger the CI pipeline. (Self-merging PRs into `integration` is allowed by the branch protection rules.) + +The merge to the `integration` branch triggers the workshop_ci pipeline. In Azure DevOps, navigate to Pipelines and you should see the CI pipeline running. Click into the run and examine all the steps; note that the CI pipeline is running the steps defined in `ci.yml`. + + The CI pipeline has multiple steps, including logging in to Azure, accessing the relevant code from the remote git repository in Azure Repos, and specifying and running the Databricks model training workflow. + + As part of this workflow, the updated model from our current changes is compared to our best previous model; if it performs better, it passes the evaluation step, which is reflected in the evaluation notebook. + + + +7. Confirm that the Azure Pipelines CI pipeline triggers the Databricks workflow job. + + +8. If the workflow completes successfully, and the model evaluation step shows improvement, then the new model produced by the notebook workflow is registered and promoted to the "Staging" slot in the model registry. + + +9. If your new model training code passed the integration testing (and the unit testing before that), then it seems likely you've made an improvement to the model training workflow, and the new workflow should replace the old one reflected in the `main` branch. In Part 5, we'll establish a continuous deployment (CD) Azure Pipeline and set up appropriate triggers to automate and manage the promotion of the new workflow to the production `main` branch. + + +## Success criteria +- Trigger the CI pipeline when a pull request is merged to the `integration` branch +- Successfully run the CI pipeline, which also runs the Databricks end-to-end workflow +- Check in Databricks whether the new model performs well enough to potentially replace the current best model. + + +## [Go to Part 5](part_5.md) + diff --git a/MLOps-ADO-ADB/src/workshop/documents/part_5.md b/MLOps-ADO-ADB/src/workshop/documents/part_5.md index 65bcfadd..7c6df3fe 100644 --- a/MLOps-ADO-ADB/src/workshop/documents/part_5.md +++ b/MLOps-ADO-ADB/src/workshop/documents/part_5.md @@ -1,70 +1,70 @@ -# Part 5: Continuous Delivery (CD) - -## Pre-requisites -- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md), [Part 3](part_3.md) and [Part 4](part_4.md) - -## Summary - -After a successful run of the CI (continuous integration) pipeline, your team is looking to complete the process with a CD (continuous delivery, or continuous deployment) pipeline that will handle the deployment of the new, better-performing model while maintaining continuous delivery of the model to processes that depend on the model's availability, without introducing any downtime in production, also known as a "hot swap". - -The goal of this section is to get a fully functional CD pipeline running that will: - -1. Authenticates using a Service Principal to be able to leverage the Azure Databricks commands in your workflow. -2. 
Be automatically triggered based on a Pull Request (PR) that is approved to merge the new code that passes the integration tests in the `integration` branch into the `main` branch of the repo. -3. If the model performance metrics show improvement over the current production model on production data, then promote the new model to production and archive the old model. - -## Steps - -1. As you have done since Part 3, you define triggers as part of a Azure Pipelines workflow. The CD workflow is triggered when a pull request is created and the new code in `integration` is merged into the `main` branch. The PR to `main` is opened if the new code results in a model that outperforms the prior model on test data. The triggers for this workshop have already been defined in `.azure_pipelines/workflows/cd.yml`. - -The key elements of the trigger section are as follows: - -``` -# .azure_pipelines/workflows/cd.yml - -trigger: - branches: - exclude: - - integration - include: - - main - paths: - include: - - src/workshop/notebooks/part_1_1_data_prep.ipynb - - src/workshop/notebooks/part_1_2_training.ipynb - - src/workshop/notebooks/part_1_3_evaluating.ipynb - - .azure_pipelines/cd.yml - -``` - -2. The CD workflow relies on the Azure CLI to control the infrastructure and implement the automation of the model deployments. Therefore, we need to setup this workflow to login to Azure via a Service Principal to be able to leverage the Azure CLI. - - > Action Items: - > 1. Open up the `cd.yml` file in your Azure repo under `.azure_pipelines/`. - > 2. Update the 'creds: ${{ secrets...' section in this file to setup your secret name. Follow the instructions in this file annotated with #setup. - - > Note: Please refer to [Use the Azure login action with a service principal secret](https://docs.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Cwindows#use-the-azure-login-action-with-a-service-principal-secret) to create the proper Azure Credentials if you haven't done so already (you should have already defined such secret to complete the CI part of the workshop, i.e. [Part 4](part_4.md)). - -3. In our scenario, a model is deployed to production when it occupies the "Production" model slot in the model registry. Our CD pipeline needs to ensure that the current best model is always available in the "Production" slot. The Azure Pipeline we specify for CD automates these deployments. - -Now let's configure the Azure Pipelines configuration file that controls the CD process located at `.azure_pipelines/cd.yml` - -> Action Item: ->- Edit `cd.yml` to setup your Azure resource group name and Azure ML workspace name which are being passed as parameters to a set of custom GitHub Actions. Look for #setup and follow the instructions in the file. - -> Action Items: -> 1. Commit your configuration changes and push them up to the Azure Repo in your own development branch. -> 2. Go to the Azure Pipelines UI, select the pipeline you configured in 'cd.yml', and trigger it to run now on your own branch. -> 3. Once triggered, click on it to open up the details and monitor its execution. - - -4. (optional) Test the new deployment using `/notebooks/part_1_4_scoring`. - -## Success criteria - -- The CD pipeline runs sucessfully each time a PR request to 'main' is merged. Please test this by creating your own PR to main. -- Check that the better new model is deployed to the Production slot in your model registry, the Models section of Azure Databricks. - - -## Congratulations! 
+# Part 5: Continuous Delivery (CD) + +## Pre-requisites +- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md), [Part 3](part_3.md) and [Part 4](part_4.md) + +## Summary + +After a successful run of the CI (continuous integration) pipeline, your team is looking to complete the process with a CD (continuous delivery, or continuous deployment) pipeline that will handle the deployment of the new, better-performing model while maintaining continuous delivery of the model to processes that depend on the model's availability, without introducing any downtime in production, also known as a "hot swap". + +The goal of this section is to get a fully functional CD pipeline running that will: + +1. Authenticate using a Service Principal to be able to leverage the Azure Databricks commands in your workflow. +2. Be automatically triggered based on an approved Pull Request (PR) that merges the new code that passed the integration tests from the `integration` branch into the `main` branch of the repo. +3. Promote the new model to production and archive the old model, if the model performance metrics show improvement over the current production model on production data. + +## Steps + +1. As you have done since Part 3, you define triggers as part of an Azure Pipelines workflow. The CD workflow is triggered when a pull request is created and the new code in `integration` is merged into the `main` branch. The PR to `main` is opened if the new code results in a model that outperforms the prior model on test data. The triggers for this workshop have already been defined in `.azure_pipelines/workflows/cd.yml`. + +The key elements of the trigger section are as follows: + +``` +# .azure_pipelines/workflows/cd.yml + +trigger: + branches: + exclude: + - integration + include: + - main + paths: + include: + - src/workshop/notebooks/part_1_1_data_prep.ipynb + - src/workshop/notebooks/part_1_2_training.ipynb + - src/workshop/notebooks/part_1_3_evaluating.ipynb + - .azure_pipelines/cd.yml + +``` + +2. The CD workflow relies on the Azure CLI to control the infrastructure and implement the automation of the model deployments. Therefore, we need to set up this workflow to log in to Azure via a Service Principal to be able to leverage the Azure CLI. + + > Action Items: + > 1. Open up the `cd.yml` file in your Azure repo under `.azure_pipelines/`. + > 2. Update the 'creds: ${{ secrets...' section in this file to set up your secret name. Follow the instructions in this file annotated with #setup. + + > Note: Please refer to [Use the Azure login action with a service principal secret](https://docs.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Cwindows#use-the-azure-login-action-with-a-service-principal-secret) to create the proper Azure Credentials if you haven't done so already (you should have already defined such a secret to complete the CI part of the workshop, i.e. [Part 4](part_4.md)). + +3. In our scenario, a model is deployed to production when it occupies the "Production" model slot in the model registry. Our CD pipeline needs to ensure that the current best model is always available in the "Production" slot. The Azure Pipeline we specify for CD automates these deployments. 
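Before editing the pipeline configuration, it may help to see what "promoting a model to the Production slot" looks like in code. The sketch below uses the MLflow client the same way the MLflow example notebook elsewhere in this repo does; the registered-model name and version numbers are illustrative placeholders, and in the workshop it is the CD pipeline, not you, that performs the equivalent steps.

```python
# Minimal sketch of the promotion the CD pipeline automates: move the newly
# approved model version into "Production" and archive the version it replaces.
# The model name and version numbers below are illustrative placeholders.
from mlflow.tracking import MlflowClient

client = MlflowClient()

model_name = "workshop_model"   # placeholder registered-model name
new_version = "3"               # version that passed evaluation in Staging
old_version = "2"               # version currently serving Production

# Promote the new version to the Production stage
client.transition_model_version_stage(
    name=model_name, version=new_version, stage="Production"
)

# Archive the previous Production version so only one version serves production
client.transition_model_version_stage(
    name=model_name, version=old_version, stage="Archived"
)

print(f"{model_name} v{new_version} is now in Production; v{old_version} archived.")
```

Because consumers load the model by stage (for example, a path like `models:/<model_name>/production`), the swap happens without any change to the consuming code, which is what makes the "hot swap" possible.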
 + +Now let's configure the Azure Pipelines configuration file that controls the CD process, located at `.azure_pipelines/cd.yml`. + +> Action Item: +>- Edit `cd.yml` to set up your Azure resource group name and Azure ML workspace name, which are passed as parameters to the pipeline's custom deployment steps. Look for #setup and follow the instructions in the file. + +> Action Items: +> 1. Commit your configuration changes and push them up to the Azure Repo in your own development branch. +> 2. Go to the Azure Pipelines UI, select the pipeline you configured in 'cd.yml', and trigger it to run now on your own branch. +> 3. Once triggered, click on it to open up the details and monitor its execution. + + +4. (optional) Test the new deployment using `/notebooks/part_1_4_scoring`. + +## Success criteria + +- The CD pipeline runs successfully each time a PR to 'main' is merged. Please test this by creating your own PR to main. +- Check that the new, better-performing model is deployed to the Production slot in your model registry, the Models section of Azure Databricks. + + +## Congratulations! This completes this workshop. You have gained hands-on experience with many of the key concepts involved in MLOps using Azure Databricks and Azure DevOps. \ No newline at end of file diff --git a/MLOps-ADO-ADB/src/workshop/documents/part_tips.md b/MLOps-ADO-ADB/src/workshop/documents/part_tips.md index d6b6acff..5dc1c40a 100644 --- a/MLOps-ADO-ADB/src/workshop/documents/part_tips.md +++ b/MLOps-ADO-ADB/src/workshop/documents/part_tips.md @@ -1,31 +1,31 @@ -# Pre-Workshop Checklist -> Note: Review the following criteria to ensure you can complete the workshop. These are critical pieces of access to get right for a successful workshop experience. - -## Azure -1. Do you have an Azure account? - -2. Do you have a `Contributor` role for your Azure Subscription? - - If you don't, do you have a `Contributor` role for the Azure Resource Group? - > Note: If you don't, you can't run the workshop. - -3. Do you have a Service Principal? - - If you don't, do you know the Service Principal and it's information (client id, secret)? - - If you don't, can you ask your Cloud team to create the Service Principal for limited scope of a resource group? - > Note: If you don't, you can't run the workshop. - -4. Do you know who can help you to handle issues? - -5. Do you know a person from your Cloud infra/security team who can help you: - - Create Azure resources - - Grant permission - -6. Did you register 'Microsoft.MachineLearningServices' for your Azure subscription? -> Note: If you're not sure, go to the Azure Portal > Subscriptions > 'YourSubscription' > Resource providers' > Search 'Microsoft.MachineLearningServices' - -![ml_services](./images/arm100.png) - -## Github -1. Do you have a Github account? -> Note: If not, create a new account and follow the instructions in Part 0 of the workshop. - -# [Go to Part 0](./part_0.md) +# Pre-Workshop Checklist +> Note: Review the following criteria to ensure you can complete the workshop. These are critical pieces of access to get right for a successful workshop experience. + +## Azure +1. Do you have an Azure account? + +2. Do you have a `Contributor` role for your Azure Subscription? + - If you don't, do you have a `Contributor` role for the Azure Resource Group? + > Note: If you don't, you can't run the workshop. + +3. Do you have a Service Principal? + - If you don't, do you know the Service Principal and its information (client id, secret)? 
+ - If you don't, can you ask your Cloud team to create the Service Principal for limited scope of a resource group? + > Note: If you don't, you can't run the workshop. + +4. Do you know who can help you to handle issues? + +5. Do you know a person from your Cloud infra/security team who can help you: + - Create Azure resources + - Grant permission + +6. Did you register 'Microsoft.MachineLearningServices' for your Azure subscription? +> Note: If you're not sure, go to the Azure Portal > Subscriptions > 'YourSubscription' > Resource providers' > Search 'Microsoft.MachineLearningServices' + +![ml_services](./images/arm100.png) + +## Github +1. Do you have a Github account? +> Note: If not, create a new account and follow the instructions in Part 0 of the workshop. + +# [Go to Part 0](./part_0.md) diff --git a/MLOps-ADO-ADB/src/workshop/notebooks/mlflow-end-to-end-example.ipynb b/MLOps-ADO-ADB/src/workshop/notebooks/mlflow-end-to-end-example.ipynb index 62d8fa41..9a861bc9 100644 --- a/MLOps-ADO-ADB/src/workshop/notebooks/mlflow-end-to-end-example.ipynb +++ b/MLOps-ADO-ADB/src/workshop/notebooks/mlflow-end-to-end-example.ipynb @@ -1,1329 +1,1329 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Training machine learning models on tabular data: an end-to-end example\n", - "\n", - "This tutorial covers the following steps:\n", - "- Import data from your local machine into the Databricks File System (DBFS)\n", - "- Visualize the data using Seaborn and matplotlib\n", - "- Run a parallel hyperparameter sweep to train machine learning models on the dataset\n", - "- Explore the results of the hyperparameter sweep with MLflow\n", - "- Register the best performing model in MLflow\n", - "- Apply the registered model to another dataset using a Spark UDF\n", - "- Set up model serving for low-latency requests\n", - "\n", - "In this example, you build a model to predict the quality of Portugese \"Vinho Verde\" wine based on the wine's physicochemical properties. \n", - "\n", - "The example uses a dataset from the UCI Machine Learning Repository, presented in [*Modeling wine preferences by data mining from physicochemical properties*](https://www.sciencedirect.com/science/article/pii/S0167923609001377?via%3Dihub) [Cortez et al., 2009].\n", - "\n", - "## Requirements\n", - "This notebook requires Databricks Runtime for Machine Learning. \n", - "If you are using Databricks Runtime 7.3 LTS ML or below, you must update the CloudPickle library. To do that, uncomment and run the `%pip install` command in Cmd 2." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "3eab982e-0862-4138-b473-53bfc84bcf48", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# This command is only required if you are using a cluster running DBR 7.3 LTS ML or below. 
\n", - "#%pip install --upgrade cloudpickle" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "366dccef-47e4-466b-af76-53e29ad99472", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Import data\n", - " \n", - "In this section, you download a dataset from the web and upload it to Databricks File System (DBFS).\n", - "\n", - "1. Navigate to https://archive.ics.uci.edu/dataset/186/wine+quality and download the dataset to your local machine. The download contains two .csv files, `winequality-red.csv` and `winequality-white.csv`.\n", - "\n", - "1. From this Databricks notebook, select **File > Upload data to DBFS...**, and drag these files to the drag-and-drop target to upload them to the Databricks File System (DBFS). \n", - "\n", - " **Note**: if you don't have the **File > Upload data to DBFS...** option, you can load the dataset from the Databricks example datasets. Uncomment and run the last two lines in the following cell.\n", - "\n", - "1. Click **Next**. Auto-generated code to load the data appears. Under **Access Files from Notebooks**, select the **pandas** tab. Click **Copy** to copy the example code, and then click **Done**. \n", - "\n", - "1. Create a new cell, then paste in the sample code. It will look similar to the code shown in the following cell. Make these changes:\n", - " - Pass `sep=';'` to `pd.read_csv`\n", - " - Change the variable names from `df1` and `df2` to `white_wine` and `red_wine`, as shown in the following cell." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "fff57c10-e07e-47f2-b231-7ae3556ce157", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# import pandas as pd\n", - "\n", - "# white_wine = pd.read_csv(\"/dbfs/FileStore/shared_uploads/nick.switanek@microsoft.com/winequality_red.csv\", sep=';')\n", - "# red_wine = pd.read_csv(\"/dbfs/FileStore/shared_uploads/nick.switanek@microsoft.com/winequality_white.csv\", sep=';')" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "a1e7323b-decf-43f0-ab1b-c3b5a5e0159e", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# If you have the File > Upload Data menu option, follow the instructions in the previous cell to upload the data from your local machine.\n", - "# The generated code, including the required edits described in the previous cell, is shown here for reference.\n", - "\n", - "import pandas as pd\n", - "\n", - "# In the following lines, replace with your username.\n", - "# white_wine = pd.read_csv(\"/dbfs/FileStore/shared_uploads//winequality_white.csv\", sep=';')\n", - "# red_wine = pd.read_csv(\"/dbfs/FileStore/shared_uploads//winequality_red.csv\", sep=';')\n", - "\n", - "# If you do not have the File > Upload Data menu option, uncomment and run these lines to load the dataset.\n", - "\n", - "white_wine = pd.read_csv(\"/dbfs/databricks-datasets/wine-quality/winequality-white.csv\", sep=\";\")\n", - "red_wine = pd.read_csv(\"/dbfs/databricks-datasets/wine-quality/winequality-red.csv\", sep=\";\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - 
"nuid": "08eaf617-f2fd-4ea1-a054-633bd470b7f6", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Merge the two DataFrames into a single dataset, with a new binary feature \"is_red\" that indicates whether the wine is red or white." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "09e989d5-949f-4bec-9122-060b995a2f11", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "red_wine['is_red'] = 1\n", - "white_wine['is_red'] = 0\n", - "\n", - "data = pd.concat([red_wine, white_wine], axis=0)\n", - "\n", - "# Remove spaces from column names\n", - "data.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "cf5ae58a-d4d4-4006-9a8d-bc7ae8131c12", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "4ce0fcc0-6131-4db8-8497-5e525b7252c3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Visualize data\n", - "\n", - "Before training a model, explore the dataset using Seaborn and Matplotlib." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "31cf51f6-4bd1-4aa5-ac59-9deede4ccf86", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Plot a histogram of the dependent variable, quality." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "5f38e522-42e9-4673-8bfa-43a5fc28ee6d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import seaborn as sns\n", - "sns.distplot(data.quality, kde=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "cce08fec-03ea-4d8a-846e-b602ea950be3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Looks like quality scores are normally distributed between 3 and 9. \n", - "\n", - "Define a wine as high quality if it has quality >= 7." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "501a185a-3125-4ac4-83c1-61d28b529eaf", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "high_quality = (data.quality >= 7).astype(int)\n", - "data.quality = high_quality" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f7b8d9dc-4408-4a84-9a5d-3d187960d9fe", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Box plots are useful in noticing correlations between features and a binary label." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "0ee658d2-fb77-451b-b119-f680eb8f345b", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "dims = (3, 4)\n", - "\n", - "f, axes = plt.subplots(dims[0], dims[1], figsize=(25, 15))\n", - "axis_i, axis_j = 0, 0\n", - "for col in data.columns:\n", - " if col == 'is_red' or col == 'quality':\n", - " continue # Box plots cannot be used on indicator variables\n", - " sns.boxplot(x=high_quality, y=data[col], ax=axes[axis_i, axis_j])\n", - " axis_j += 1\n", - " if axis_j == dims[1]:\n", - " axis_i += 1\n", - " axis_j = 0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "c379687d-03da-41aa-bdf5-953db5ec9534", - "showTitle": false, - "title": "" - } - }, - "source": [ - "In the above box plots, a few variables stand out as good univariate predictors of quality. \n", - "\n", - "- In the alcohol box plot, the median alcohol content of high quality wines is greater than even the 75th quantile of low quality wines. High alcohol content is correlated with quality.\n", - "- In the density box plot, low quality wines have a greater density than high quality wines. Density is inversely correlated with quality." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "27b6967c-5a25-4fe6-894c-4c07a63014d7", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Preprocess data\n", - "Prior to training a model, check for missing values and split the data into training and validation sets." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "601c2717-c8a5-4411-ba78-e033a9bcdd83", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "data.isna().any()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "aff3310f-e556-44b6-8f7d-e547682e7677", - "showTitle": false, - "title": "" - } - }, - "source": [ - "There are no missing values." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "379abb9a-bd5b-46b9-8975-ce91858eba39", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Prepare dataset for training baseline model\n", - "Split the input data into 3 sets:\n", - "- Train (60% of the dataset used to train the model)\n", - "- Validation (20% of the dataset used to tune the hyperparameters)\n", - "- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "e70c1c07-0048-4bb8-a0c3-17a7895bb4b2", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "X = data.drop([\"quality\"], axis=1)\n", - "y = data.quality\n", - "\n", - "# Split out the training data\n", - "X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", - "\n", - "# Split the remaining data equally into validation and test\n", - "X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "224d0ddf-3925-4849-bc20-564175c212e8", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Build a baseline model\n", - "This task seems well suited to a random forest classifier, since the output is binary and there may be interactions between multiple variables.\n", - "\n", - "The following code builds a simple classifier using scikit-learn. It uses MLflow to keep track of the model accuracy, and to save the model for later use." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "46af6002-8063-4ada-b7e5-6ce56c22f60a", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import mlflow\n", - "import mlflow.pyfunc\n", - "import mlflow.sklearn\n", - "import numpy as np\n", - "import sklearn\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.metrics import roc_auc_score\n", - "from mlflow.models.signature import infer_signature\n", - "from mlflow.utils.environment import _mlflow_conda_env\n", - "import cloudpickle\n", - "import time\n", - "\n", - "# The predict method of sklearn's RandomForestClassifier returns a binary classification (0 or 1). \n", - "# The following code creates a wrapper function, SklearnModelWrapper, that uses \n", - "# the predict_proba method to return the probability that the observation belongs to each class. \n", - "\n", - "class SklearnModelWrapper(mlflow.pyfunc.PythonModel):\n", - " def __init__(self, model):\n", - " self.model = model\n", - " \n", - " def predict(self, context, model_input):\n", - " return self.model.predict_proba(model_input)[:,1]\n", - "\n", - "# mlflow.start_run creates a new MLflow run to track the performance of this model. 
\n", - "# Within the context, you call mlflow.log_param to keep track of the parameters used, and\n", - "# mlflow.log_metric to record metrics like accuracy.\n", - "with mlflow.start_run(run_name='untuned_random_forest'):\n", - " n_estimators = 10\n", - " model = RandomForestClassifier(n_estimators=n_estimators, random_state=np.random.RandomState(123))\n", - " model.fit(X_train, y_train)\n", - "\n", - " # predict_proba returns [prob_negative, prob_positive], so slice the output with [:, 1]\n", - " predictions_test = model.predict_proba(X_test)[:,1]\n", - " auc_score = roc_auc_score(y_test, predictions_test)\n", - " mlflow.log_param('n_estimators', n_estimators)\n", - " # Use the area under the ROC curve as a metric.\n", - " mlflow.log_metric('auc', auc_score)\n", - " wrappedModel = SklearnModelWrapper(model)\n", - " # Log the model with a signature that defines the schema of the model's inputs and outputs. \n", - " # When the model is deployed, this signature will be used to validate inputs.\n", - " signature = infer_signature(X_train, wrappedModel.predict(None, X_train))\n", - " \n", - " # MLflow contains utilities to create a conda environment used to serve models.\n", - " # The necessary dependencies are added to a conda.yaml file which is logged along with the model.\n", - " conda_env = _mlflow_conda_env(\n", - " additional_conda_deps=None,\n", - " additional_pip_deps=[\"cloudpickle=={}\".format(cloudpickle.__version__), \"scikit-learn=={}\".format(sklearn.__version__)],\n", - " additional_conda_channels=None,\n", - " )\n", - " mlflow.pyfunc.log_model(\"random_forest_model\", python_model=wrappedModel, conda_env=conda_env, signature=signature)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "867b3e23-9147-4ca4-b2d2-f1103c471d53", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Examine the learned feature importances output by the model as a sanity-check." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "1e174abe-774b-461e-b390-212e11d1c68d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])\n", - "feature_importances.sort_values('importance', ascending=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "06a939b8-7232-464b-8370-2bc9e3a8c613", - "showTitle": false, - "title": "" - } - }, - "source": [ - "As illustrated by the boxplots shown previously, both alcohol and density are important in predicting quality." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f8981b9f-8833-4dc4-a1cb-b25f9f43a8c6", - "showTitle": false, - "title": "" - } - }, - "source": [ - "You logged the Area Under the ROC Curve (AUC) to MLflow. Click **Experiment** at the upper right to display the Experiment Runs sidebar. \n", - "\n", - "The model achieved an AUC of 0.854.\n", - "\n", - "A random classifier would have an AUC of 0.5, and higher AUC values are better. For more information, see [Receiver Operating Characteristic Curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "8d1df4d8-4b50-47d8-ad89-d3c80ce960ee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "#### Register the model in MLflow Model Registry\n", - "\n", - "By registering this model in Model Registry, you can easily reference the model from anywhere within Databricks.\n", - "\n", - "The following section shows how to do this programmatically, but you can also register a model using the UI. See \"[Create or register a model using the UI](https://docs.microsoft.com/azure/databricks/applications/machine-learning/manage-model-lifecycle/index#create-or-register-a-model-using-the-ui)\"." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "9f7b7e14-b016-41c1-9852-aae0f6625e93", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "run_id = mlflow.search_runs(filter_string='tags.mlflow.runName = \"untuned_random_forest\"').iloc[0].run_id" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "4774edbf-f3fb-495c-a638-ec63f275f885", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# If you see the error \"PERMISSION_DENIED: User does not have any permission level assigned to the registered model\", \n", - "# the cause may be that a model already exists with the name \"wine_quality\". Try using a different name.\n", - "model_name = \"wine_quality\"\n", - "model_version = mlflow.register_model(f\"runs:/{run_id}/random_forest_model\", model_name)\n", - "\n", - "# Registering the model takes a few seconds, so add a small delay\n", - "time.sleep(15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "46f16d91-13ad-4e28-866f-3a979eb6fde1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "You should now see the model in the Models page. To display the Models page, click the Models icon in the left sidebar. \n", - "\n", - "Next, transition this model to production and load it into this notebook from Model Registry." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "cb032bd6-7e9d-4430-8c13-895ebc189297", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from mlflow.tracking import MlflowClient\n", - "\n", - "client = MlflowClient()\n", - "client.transition_model_version_stage(\n", - " name=model_name,\n", - " version=model_version.version,\n", - " stage=\"Production\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f284978e-9c9b-4cd2-8bdc-1393991c5701", - "showTitle": false, - "title": "" - } - }, - "source": [ - "The Models page now shows the model version in stage \"Production\".\n", - "\n", - "You can now refer to the model using the path \"models:/wine_quality/production\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "58b22211-62ea-4fdd-a6ff-c552a65fa6e9", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "model = mlflow.pyfunc.load_model(f\"models:/{model_name}/production\")\n", - "\n", - "# Sanity-check: This should match the AUC logged by MLflow\n", - "print(f'AUC: {roc_auc_score(y_test, model.predict(X_test))}')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "6bc8e62b-0ee7-42bf-838a-45c020259814", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Experiment with a new model\n", - "\n", - "The random forest model performed well even without hyperparameter tuning.\n", - "\n", - "The following code uses the xgboost library to train a more accurate model. It runs a parallel hyperparameter sweep to train multiple\n", - "models in parallel, using Hyperopt and SparkTrials. As before, the code tracks the performance of each parameter configuration with MLflow." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "01f69756-98aa-4e0e-b8bf-0a7d84dbdb05", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK\n", - "from hyperopt.pyll import scope\n", - "from math import exp\n", - "import mlflow.xgboost\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "\n", - "search_space = {\n", - " 'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),\n", - " 'learning_rate': hp.loguniform('learning_rate', -3, 0),\n", - " 'reg_alpha': hp.loguniform('reg_alpha', -5, -1),\n", - " 'reg_lambda': hp.loguniform('reg_lambda', -6, -1),\n", - " 'min_child_weight': hp.loguniform('min_child_weight', -1, 3),\n", - " 'objective': 'binary:logistic',\n", - " 'seed': 123, # Set a seed for deterministic training\n", - "}\n", - "\n", - "def train_model(params):\n", - " # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.\n", - " mlflow.xgboost.autolog()\n", - " with mlflow.start_run(nested=True):\n", - " train = xgb.DMatrix(data=X_train, label=y_train)\n", - " validation = xgb.DMatrix(data=X_val, label=y_val)\n", - " # Pass in the validation set so xgb can track an evaluation metric. XGBoost terminates training when the evaluation metric\n", - " # is no longer improving.\n", - " booster = xgb.train(params=params, dtrain=train, num_boost_round=1000,\\\n", - " evals=[(validation, \"validation\")], early_stopping_rounds=50)\n", - " validation_predictions = booster.predict(validation)\n", - " auc_score = roc_auc_score(y_val, validation_predictions)\n", - " mlflow.log_metric('auc', auc_score)\n", - "\n", - " signature = infer_signature(X_train, booster.predict(train))\n", - " mlflow.xgboost.log_model(booster, \"model\", signature=signature)\n", - " \n", - " # Set the loss to -1*auc_score so fmin maximizes the auc_score\n", - " return {'status': STATUS_OK, 'loss': -1*auc_score, 'booster': booster.attributes()}\n", - "\n", - "# Greater parallelism will lead to speedups, but a less optimal hyperparameter sweep. 
\n", - "# A reasonable value for parallelism is the square root of max_evals.\n", - "spark_trials = SparkTrials(parallelism=10)\n", - "\n", - "# Run fmin within an MLflow run context so that each hyperparameter configuration is logged as a child run of a parent\n", - "# run called \"xgboost_models\" .\n", - "with mlflow.start_run(run_name='xgboost_models'):\n", - " best_params = fmin(\n", - " fn=train_model, \n", - " space=search_space, \n", - " algo=tpe.suggest, \n", - " max_evals=96,\n", - " trials=spark_trials,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "012e3c74-236f-4166-bb15-b129eb01181a", - "showTitle": false, - "title": "" - } - }, - "source": [ - "#### Use MLflow to view the results\n", - "Open up the Experiment Runs sidebar to see the MLflow runs. Click on Date next to the down arrow to display a menu, and select 'auc' to display the runs sorted by the auc metric. The highest auc value is 0.90.\n", - "\n", - "MLflow tracks the parameters and performance metrics of each run. Click the External Link icon at the top of the Experiment Runs sidebar to navigate to the MLflow Runs Table." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "00e5ea89-0918-44b3-9fca-60f7f48fb6b5", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Now investigate how the hyperparameter choice correlates with AUC. Click the \"+\" icon to expand the parent run, then select all runs except the parent, and click \"Compare\". Select the Parallel Coordinates Plot.\n", - "\n", - "The Parallel Coordinates Plot is useful in understanding the impact of parameters on a metric. You can drag the pink slider bar at the upper right corner of the plot to highlight a subset of AUC values and the corresponding parameter values. The plot below highlights the highest AUC values:\n", - "\n", - "\n", - "\n", - "Notice that all of the top performing runs have a low value for reg_lambda and learning_rate. \n", - "\n", - "You could run another hyperparameter sweep to explore even lower values for these parameters. For simplicity, that step is not included in this example." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "1752fa82-55e2-457c-b6d3-d545483b1eae", - "showTitle": false, - "title": "" - } - }, - "source": [ - "You used MLflow to log the model produced by each hyperparameter configuration. The following code finds the best performing run and saves the model to Model Registry." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "ac2d08c5-5381-40e8-b8a2-a0737ca5f934", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "best_run = mlflow.search_runs(order_by=['metrics.auc DESC']).iloc[0]\n", - "print(f'AUC of Best Run: {best_run[\"metrics.auc\"]}')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "c34b4cf8-35f1-48e8-bc64-51b6b4cf0f70", - "showTitle": false, - "title": "" - } - }, - "source": [ - "#### Update the production `wine_quality` model in MLflow Model Registry\n", - "\n", - "Earlier, you saved the baseline model to Model Registry with the name `wine_quality`. Now that you have a created a more accurate model, update `wine_quality`." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "e7419396-4da9-4181-8a3d-613b80d3544b", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "new_model_version = mlflow.register_model(f\"runs:/{best_run.run_id}/model\", model_name)\n", - "\n", - "# Registering the model takes a few seconds, so add a small delay\n", - "time.sleep(15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "faaceaa1-fb8c-4e72-80c6-b9c7c5815a18", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Click **Models** in the left sidebar to see that the `wine_quality` model now has two versions. \n", - "\n", - "The following code promotes the new version to production." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "49c2eaec-eaf5-4e38-ab58-1d029271e4b9", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Archive the old model version\n", - "client.transition_model_version_stage(\n", - " name=model_name,\n", - " version=model_version.version,\n", - " stage=\"Archived\"\n", - ")\n", - "\n", - "# Promote the new model version to Production\n", - "client.transition_model_version_stage(\n", - " name=model_name,\n", - " version=new_model_version.version,\n", - " stage=\"Production\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "11553e11-faea-43ae-aac1-10545a252e64", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Clients that call load_model now receive the new model." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "6c7a24e2-9eb2-432b-af35-e3bf3dc03a46", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# This code is the same as the last block of \"Building a Baseline Model\". 
No change is required for clients to get the new model!\n", - "model = mlflow.pyfunc.load_model(f\"models:/{model_name}/production\")\n", - "print(f'AUC: {roc_auc_score(y_test, model.predict(X_test))}')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "383276bc-bdfe-416f-956d-c45773af7a00", - "showTitle": false, - "title": "" - } - }, - "source": [ - "The auc value on the test set for the new model is 0.90. You beat the baseline!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "2be6ef67-12ae-479a-8ba9-2a9ff90224b3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Batch inference\n", - "\n", - "There are many scenarios where you might want to evaluate a model on a corpus of new data. For example, you may have a fresh batch of data, or may need to compare the performance of two models on the same corpus of data.\n", - "\n", - "The following code evaluates the model on data stored in a Delta table, using Spark to run the computation in parallel." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "5b42b701-ccf1-4bd0-a9dd-1e705bcdd795", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# To simulate a new corpus of data, save the existing X_train data to a Delta table. \n", - "# In the real world, this would be a new batch of data.\n", - "spark_df = spark.createDataFrame(X_train)\n", - "# Replace with your username before running this cell.\n", - "table_path = \"dbfs://delta/wine_data\"\n", - "# Delete the contents of this path in case this cell has already been run\n", - "dbutils.fs.rm(table_path, True)\n", - "spark_df.write.format(\"delta\").save(table_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f1be915c-f0ea-4003-acc3-a56941da3ca8", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Load the model into a Spark UDF, so it can be applied to the Delta table." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f9a1e264-e441-46b5-b38c-b896f7d21b1f", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import mlflow.pyfunc\n", - "\n", - "apply_model_udf = mlflow.pyfunc.spark_udf(spark, f\"models:/{model_name}/production\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "e6bd7794-a6d3-4f87-8b06-11cdef5ac607", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Read the \"new data\" from Delta\n", - "new_data = spark.read.format(\"delta\").load(table_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "b09df20f-a6c7-40ba-a23d-cf158bc56cb0", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "display(new_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "aa22c7ec-d8c9-4cdd-a7f6-4ba366978cee", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from pyspark.sql.functions import struct\n", - "\n", - "# Apply the model to the new data\n", - "udf_inputs = struct(*(X_train.columns.tolist()))\n", - "\n", - "new_data = new_data.withColumn(\n", - " \"prediction\",\n", - " apply_model_udf(udf_inputs)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "e3437534-cf5f-4acb-bfb4-23de89f88ddf", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Each row now has an associated prediction. Note that the xgboost function does not output probabilities by default, so the predictions are not limited to the range [0, 1].\n", - "display(new_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "485cdc68-326a-4a17-ac48-5458413e8e8b", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Model serving\n", - "\n", - "To productionize the model for low latency predictions, use Databricks Model Serving ([AWS](https://docs.databricks.com/machine-learning/model-serving/index.html)|[Azure](https://docs.microsoft.com/azure/databricks/machine-learning/model-serving/index)) to deploy the model to an endpoint.\n", - "\n", - "The following code illustrates how to issue requests using a REST API to get predictions from the deployed model." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "99b2067d-cd94-4ec7-9216-ed36c6253fa8", - "showTitle": false, - "title": "" - } - }, - "source": [ - "You need a Databricks token to issue requests to your model endpoint. You can generate a token from the User Settings page (click Settings in the left sidebar). Copy the token into the next cell." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "06f073d5-9b77-41e1-8bcd-67f17e288d2d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"DATABRICKS_TOKEN\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "305d8fd0-3e43-4100-8ad3-a6f21eb8dddf", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Click **Models** in the left sidebar and navigate to the registered wine model. Click the serving tab, and then click **Enable Serving**.\n", - "\n", - "Then, under **Call The Model**, click the **Python** button to display a Python code snippet to issue requests. Copy the code into this notebook. It should look similar to the code in the next cell. \n", - "\n", - "You can use the token to make these requests from outside Databricks notebooks as well." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "d8962617-79a4-4962-9491-dd77dbb95dd9", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Replace with code snippet from the model serving page\n", - "import os\n", - "import requests\n", - "import pandas as pd\n", - "\n", - "def score_model(dataset: pd.DataFrame):\n", - " url = 'https:///model/wine_quality/Production/invocations'\n", - " headers = {'Authorization': f'Bearer {os.environ.get(\"DATABRICKS_TOKEN\")}'}\n", - " data_json = dataset.to_dict(orient='records')\n", - " response = requests.request(method='POST', headers=headers, url=url, json=data_json)\n", - " if response.status_code != 200:\n", - " raise Exception(f'Request failed with status {response.status_code}, {response.text}')\n", - " return response.json()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "feb6ed38-6684-492c-8d01-ab2352103246", - "showTitle": false, - "title": "" - } - }, - "source": [ - "The model predictions from the endpoint should agree with the results from locally evaluating the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "2906da04-5ef6-4366-8ac4-bfa93ec21658", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Model serving is designed for low-latency predictions on smaller batches of data\n", - "num_predictions = 5\n", - "served_predictions = score_model(X_test[:num_predictions])\n", - "model_evaluations = model.predict(X_test[:num_predictions])\n", - "# Compare the results from the deployed model and the trained model\n", - "pd.DataFrame({\n", - " \"Model Prediction\": model_evaluations,\n", - " \"Served Model Prediction\": served_predictions,\n", - "})" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "mlflow-end-to-end-example", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Training machine learning models on tabular data: an end-to-end example\n", + "\n", + "This tutorial covers the following steps:\n", + "- Import data from your local machine into the Databricks File System (DBFS)\n", + "- Visualize the data using Seaborn and matplotlib\n", + "- Run a parallel hyperparameter sweep to train machine learning models on the dataset\n", + "- Explore the results of the hyperparameter sweep with MLflow\n", + "- Register the best performing model in MLflow\n", + "- Apply the registered model to another dataset using a Spark UDF\n", + "- Set up model serving for low-latency requests\n", + "\n", + "In this example, you build a model to predict the quality of Portugese \"Vinho Verde\" wine based on the wine's physicochemical properties. \n", + "\n", + "The example uses a dataset from the UCI Machine Learning Repository, presented in [*Modeling wine preferences by data mining from physicochemical properties*](https://www.sciencedirect.com/science/article/pii/S0167923609001377?via%3Dihub) [Cortez et al., 2009].\n", + "\n", + "## Requirements\n", + "This notebook requires Databricks Runtime for Machine Learning. \n", + "If you are using Databricks Runtime 7.3 LTS ML or below, you must update the CloudPickle library. To do that, uncomment and run the `%pip install` command in Cmd 2." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3eab982e-0862-4138-b473-53bfc84bcf48", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# This command is only required if you are using a cluster running DBR 7.3 LTS ML or below. \n", + "#%pip install --upgrade cloudpickle" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "366dccef-47e4-466b-af76-53e29ad99472", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Import data\n", + " \n", + "In this section, you download a dataset from the web and upload it to Databricks File System (DBFS).\n", + "\n", + "1. 
Navigate to https://archive.ics.uci.edu/dataset/186/wine+quality and download the dataset to your local machine. The download contains two .csv files, `winequality-red.csv` and `winequality-white.csv`.\n", + "\n", + "1. From this Databricks notebook, select **File > Upload data to DBFS...**, and drag these files to the drag-and-drop target to upload them to the Databricks File System (DBFS). \n", + "\n", + " **Note**: if you don't have the **File > Upload data to DBFS...** option, you can load the dataset from the Databricks example datasets. Uncomment and run the last two lines in the following cell.\n", + "\n", + "1. Click **Next**. Auto-generated code to load the data appears. Under **Access Files from Notebooks**, select the **pandas** tab. Click **Copy** to copy the example code, and then click **Done**. \n", + "\n", + "1. Create a new cell, then paste in the sample code. It will look similar to the code shown in the following cell. Make these changes:\n", + " - Pass `sep=';'` to `pd.read_csv`\n", + " - Change the variable names from `df1` and `df2` to `white_wine` and `red_wine`, as shown in the following cell." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "fff57c10-e07e-47f2-b231-7ae3556ce157", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# import pandas as pd\n", + "\n", + "# white_wine = pd.read_csv(\"/dbfs/FileStore/shared_uploads/nick.switanek@microsoft.com/winequality_red.csv\", sep=';')\n", + "# red_wine = pd.read_csv(\"/dbfs/FileStore/shared_uploads/nick.switanek@microsoft.com/winequality_white.csv\", sep=';')" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a1e7323b-decf-43f0-ab1b-c3b5a5e0159e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# If you have the File > Upload Data menu option, follow the instructions in the previous cell to upload the data from your local machine.\n", + "# The generated code, including the required edits described in the previous cell, is shown here for reference.\n", + "\n", + "import pandas as pd\n", + "\n", + "# In the following lines, replace with your username.\n", + "# white_wine = pd.read_csv(\"/dbfs/FileStore/shared_uploads//winequality_white.csv\", sep=';')\n", + "# red_wine = pd.read_csv(\"/dbfs/FileStore/shared_uploads//winequality_red.csv\", sep=';')\n", + "\n", + "# If you do not have the File > Upload Data menu option, uncomment and run these lines to load the dataset.\n", + "\n", + "white_wine = pd.read_csv(\"/dbfs/databricks-datasets/wine-quality/winequality-white.csv\", sep=\";\")\n", + "red_wine = pd.read_csv(\"/dbfs/databricks-datasets/wine-quality/winequality-red.csv\", sep=\";\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "08eaf617-f2fd-4ea1-a054-633bd470b7f6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Merge the two DataFrames into a single dataset, with a new binary feature \"is_red\" that indicates whether the wine is red or white." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "09e989d5-949f-4bec-9122-060b995a2f11", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "red_wine['is_red'] = 1\n", + "white_wine['is_red'] = 0\n", + "\n", + "data = pd.concat([red_wine, white_wine], axis=0)\n", + "\n", + "# Remove spaces from column names\n", + "data.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cf5ae58a-d4d4-4006-9a8d-bc7ae8131c12", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4ce0fcc0-6131-4db8-8497-5e525b7252c3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Visualize data\n", + "\n", + "Before training a model, explore the dataset using Seaborn and Matplotlib." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "31cf51f6-4bd1-4aa5-ac59-9deede4ccf86", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Plot a histogram of the dependent variable, quality." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5f38e522-42e9-4673-8bfa-43a5fc28ee6d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "sns.distplot(data.quality, kde=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "cce08fec-03ea-4d8a-846e-b602ea950be3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Looks like quality scores are normally distributed between 3 and 9. \n", + "\n", + "Define a wine as high quality if it has quality >= 7." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "501a185a-3125-4ac4-83c1-61d28b529eaf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "high_quality = (data.quality >= 7).astype(int)\n", + "data.quality = high_quality" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f7b8d9dc-4408-4a84-9a5d-3d187960d9fe", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Box plots are useful in noticing correlations between features and a binary label." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0ee658d2-fb77-451b-b119-f680eb8f345b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "dims = (3, 4)\n", + "\n", + "f, axes = plt.subplots(dims[0], dims[1], figsize=(25, 15))\n", + "axis_i, axis_j = 0, 0\n", + "for col in data.columns:\n", + " if col == 'is_red' or col == 'quality':\n", + " continue # Box plots cannot be used on indicator variables\n", + " sns.boxplot(x=high_quality, y=data[col], ax=axes[axis_i, axis_j])\n", + " axis_j += 1\n", + " if axis_j == dims[1]:\n", + " axis_i += 1\n", + " axis_j = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c379687d-03da-41aa-bdf5-953db5ec9534", + "showTitle": false, + "title": "" + } + }, + "source": [ + "In the above box plots, a few variables stand out as good univariate predictors of quality. \n", + "\n", + "- In the alcohol box plot, the median alcohol content of high quality wines is greater than even the 75th quantile of low quality wines. High alcohol content is correlated with quality.\n", + "- In the density box plot, low quality wines have a greater density than high quality wines. Density is inversely correlated with quality." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "27b6967c-5a25-4fe6-894c-4c07a63014d7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Preprocess data\n", + "Prior to training a model, check for missing values and split the data into training and validation sets." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "601c2717-c8a5-4411-ba78-e033a9bcdd83", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "data.isna().any()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "aff3310f-e556-44b6-8f7d-e547682e7677", + "showTitle": false, + "title": "" + } + }, + "source": [ + "There are no missing values." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "379abb9a-bd5b-46b9-8975-ce91858eba39", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Prepare dataset for training baseline model\n", + "Split the input data into 3 sets:\n", + "- Train (60% of the dataset used to train the model)\n", + "- Validation (20% of the dataset used to tune the hyperparameters)\n", + "- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e70c1c07-0048-4bb8-a0c3-17a7895bb4b2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = data.drop([\"quality\"], axis=1)\n", + "y = data.quality\n", + "\n", + "# Split out the training data\n", + "X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", + "\n", + "# Split the remaining data equally into validation and test\n", + "X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "224d0ddf-3925-4849-bc20-564175c212e8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Build a baseline model\n", + "This task seems well suited to a random forest classifier, since the output is binary and there may be interactions between multiple variables.\n", + "\n", + "The following code builds a simple classifier using scikit-learn. It uses MLflow to keep track of the model accuracy, and to save the model for later use." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "46af6002-8063-4ada-b7e5-6ce56c22f60a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import mlflow\n", + "import mlflow.pyfunc\n", + "import mlflow.sklearn\n", + "import numpy as np\n", + "import sklearn\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import roc_auc_score\n", + "from mlflow.models.signature import infer_signature\n", + "from mlflow.utils.environment import _mlflow_conda_env\n", + "import cloudpickle\n", + "import time\n", + "\n", + "# The predict method of sklearn's RandomForestClassifier returns a binary classification (0 or 1). \n", + "# The following code creates a wrapper function, SklearnModelWrapper, that uses \n", + "# the predict_proba method to return the probability that the observation belongs to each class. \n", + "\n", + "class SklearnModelWrapper(mlflow.pyfunc.PythonModel):\n", + " def __init__(self, model):\n", + " self.model = model\n", + " \n", + " def predict(self, context, model_input):\n", + " return self.model.predict_proba(model_input)[:,1]\n", + "\n", + "# mlflow.start_run creates a new MLflow run to track the performance of this model. 
\n", + "# Within the context, you call mlflow.log_param to keep track of the parameters used, and\n", + "# mlflow.log_metric to record metrics like accuracy.\n", + "with mlflow.start_run(run_name='untuned_random_forest'):\n", + " n_estimators = 10\n", + " model = RandomForestClassifier(n_estimators=n_estimators, random_state=np.random.RandomState(123))\n", + " model.fit(X_train, y_train)\n", + "\n", + " # predict_proba returns [prob_negative, prob_positive], so slice the output with [:, 1]\n", + " predictions_test = model.predict_proba(X_test)[:,1]\n", + " auc_score = roc_auc_score(y_test, predictions_test)\n", + " mlflow.log_param('n_estimators', n_estimators)\n", + " # Use the area under the ROC curve as a metric.\n", + " mlflow.log_metric('auc', auc_score)\n", + " wrappedModel = SklearnModelWrapper(model)\n", + " # Log the model with a signature that defines the schema of the model's inputs and outputs. \n", + " # When the model is deployed, this signature will be used to validate inputs.\n", + " signature = infer_signature(X_train, wrappedModel.predict(None, X_train))\n", + " \n", + " # MLflow contains utilities to create a conda environment used to serve models.\n", + " # The necessary dependencies are added to a conda.yaml file which is logged along with the model.\n", + " conda_env = _mlflow_conda_env(\n", + " additional_conda_deps=None,\n", + " additional_pip_deps=[\"cloudpickle=={}\".format(cloudpickle.__version__), \"scikit-learn=={}\".format(sklearn.__version__)],\n", + " additional_conda_channels=None,\n", + " )\n", + " mlflow.pyfunc.log_model(\"random_forest_model\", python_model=wrappedModel, conda_env=conda_env, signature=signature)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "867b3e23-9147-4ca4-b2d2-f1103c471d53", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Examine the learned feature importances output by the model as a sanity-check." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1e174abe-774b-461e-b390-212e11d1c68d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])\n", + "feature_importances.sort_values('importance', ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "06a939b8-7232-464b-8370-2bc9e3a8c613", + "showTitle": false, + "title": "" + } + }, + "source": [ + "As illustrated by the boxplots shown previously, both alcohol and density are important in predicting quality." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f8981b9f-8833-4dc4-a1cb-b25f9f43a8c6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "You logged the Area Under the ROC Curve (AUC) to MLflow. Click **Experiment** at the upper right to display the Experiment Runs sidebar. \n", + "\n", + "The model achieved an AUC of 0.854.\n", + "\n", + "A random classifier would have an AUC of 0.5, and higher AUC values are better. For more information, see [Receiver Operating Characteristic Curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8d1df4d8-4b50-47d8-ad89-d3c80ce960ee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Register the model in MLflow Model Registry\n", + "\n", + "By registering this model in Model Registry, you can easily reference the model from anywhere within Databricks.\n", + "\n", + "The following section shows how to do this programmatically, but you can also register a model using the UI. See \"[Create or register a model using the UI](https://docs.microsoft.com/azure/databricks/applications/machine-learning/manage-model-lifecycle/index#create-or-register-a-model-using-the-ui)\"." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9f7b7e14-b016-41c1-9852-aae0f6625e93", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "run_id = mlflow.search_runs(filter_string='tags.mlflow.runName = \"untuned_random_forest\"').iloc[0].run_id" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4774edbf-f3fb-495c-a638-ec63f275f885", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# If you see the error \"PERMISSION_DENIED: User does not have any permission level assigned to the registered model\", \n", + "# the cause may be that a model already exists with the name \"wine_quality\". Try using a different name.\n", + "model_name = \"wine_quality\"\n", + "model_version = mlflow.register_model(f\"runs:/{run_id}/random_forest_model\", model_name)\n", + "\n", + "# Registering the model takes a few seconds, so add a small delay\n", + "time.sleep(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "46f16d91-13ad-4e28-866f-3a979eb6fde1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "You should now see the model in the Models page. To display the Models page, click the Models icon in the left sidebar. \n", + "\n", + "Next, transition this model to production and load it into this notebook from Model Registry." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "cb032bd6-7e9d-4430-8c13-895ebc189297", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from mlflow.tracking import MlflowClient\n", + "\n", + "client = MlflowClient()\n", + "client.transition_model_version_stage(\n", + " name=model_name,\n", + " version=model_version.version,\n", + " stage=\"Production\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f284978e-9c9b-4cd2-8bdc-1393991c5701", + "showTitle": false, + "title": "" + } + }, + "source": [ + "The Models page now shows the model version in stage \"Production\".\n", + "\n", + "You can now refer to the model using the path \"models:/wine_quality/production\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "58b22211-62ea-4fdd-a6ff-c552a65fa6e9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "model = mlflow.pyfunc.load_model(f\"models:/{model_name}/production\")\n", + "\n", + "# Sanity-check: This should match the AUC logged by MLflow\n", + "print(f'AUC: {roc_auc_score(y_test, model.predict(X_test))}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6bc8e62b-0ee7-42bf-838a-45c020259814", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Experiment with a new model\n", + "\n", + "The random forest model performed well even without hyperparameter tuning.\n", + "\n", + "The following code uses the xgboost library to train a more accurate model. It runs a parallel hyperparameter sweep to train multiple\n", + "models in parallel, using Hyperopt and SparkTrials. As before, the code tracks the performance of each parameter configuration with MLflow." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "01f69756-98aa-4e0e-b8bf-0a7d84dbdb05", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK\n", + "from hyperopt.pyll import scope\n", + "from math import exp\n", + "import mlflow.xgboost\n", + "import numpy as np\n", + "import xgboost as xgb\n", + "\n", + "search_space = {\n", + " 'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),\n", + " 'learning_rate': hp.loguniform('learning_rate', -3, 0),\n", + " 'reg_alpha': hp.loguniform('reg_alpha', -5, -1),\n", + " 'reg_lambda': hp.loguniform('reg_lambda', -6, -1),\n", + " 'min_child_weight': hp.loguniform('min_child_weight', -1, 3),\n", + " 'objective': 'binary:logistic',\n", + " 'seed': 123, # Set a seed for deterministic training\n", + "}\n", + "\n", + "def train_model(params):\n", + " # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.\n", + " mlflow.xgboost.autolog()\n", + " with mlflow.start_run(nested=True):\n", + " train = xgb.DMatrix(data=X_train, label=y_train)\n", + " validation = xgb.DMatrix(data=X_val, label=y_val)\n", + " # Pass in the validation set so xgb can track an evaluation metric. XGBoost terminates training when the evaluation metric\n", + " # is no longer improving.\n", + " booster = xgb.train(params=params, dtrain=train, num_boost_round=1000,\\\n", + " evals=[(validation, \"validation\")], early_stopping_rounds=50)\n", + " validation_predictions = booster.predict(validation)\n", + " auc_score = roc_auc_score(y_val, validation_predictions)\n", + " mlflow.log_metric('auc', auc_score)\n", + "\n", + " signature = infer_signature(X_train, booster.predict(train))\n", + " mlflow.xgboost.log_model(booster, \"model\", signature=signature)\n", + " \n", + " # Set the loss to -1*auc_score so fmin maximizes the auc_score\n", + " return {'status': STATUS_OK, 'loss': -1*auc_score, 'booster': booster.attributes()}\n", + "\n", + "# Greater parallelism will lead to speedups, but a less optimal hyperparameter sweep. 
\n", + "# A reasonable value for parallelism is the square root of max_evals.\n", + "spark_trials = SparkTrials(parallelism=10)\n", + "\n", + "# Run fmin within an MLflow run context so that each hyperparameter configuration is logged as a child run of a parent\n", + "# run called \"xgboost_models\" .\n", + "with mlflow.start_run(run_name='xgboost_models'):\n", + " best_params = fmin(\n", + " fn=train_model, \n", + " space=search_space, \n", + " algo=tpe.suggest, \n", + " max_evals=96,\n", + " trials=spark_trials,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "012e3c74-236f-4166-bb15-b129eb01181a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Use MLflow to view the results\n", + "Open up the Experiment Runs sidebar to see the MLflow runs. Click on Date next to the down arrow to display a menu, and select 'auc' to display the runs sorted by the auc metric. The highest auc value is 0.90.\n", + "\n", + "MLflow tracks the parameters and performance metrics of each run. Click the External Link icon at the top of the Experiment Runs sidebar to navigate to the MLflow Runs Table." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "00e5ea89-0918-44b3-9fca-60f7f48fb6b5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Now investigate how the hyperparameter choice correlates with AUC. Click the \"+\" icon to expand the parent run, then select all runs except the parent, and click \"Compare\". Select the Parallel Coordinates Plot.\n", + "\n", + "The Parallel Coordinates Plot is useful in understanding the impact of parameters on a metric. You can drag the pink slider bar at the upper right corner of the plot to highlight a subset of AUC values and the corresponding parameter values. The plot below highlights the highest AUC values:\n", + "\n", + "\n", + "\n", + "Notice that all of the top performing runs have a low value for reg_lambda and learning_rate. \n", + "\n", + "You could run another hyperparameter sweep to explore even lower values for these parameters. For simplicity, that step is not included in this example." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1752fa82-55e2-457c-b6d3-d545483b1eae", + "showTitle": false, + "title": "" + } + }, + "source": [ + "You used MLflow to log the model produced by each hyperparameter configuration. The following code finds the best performing run and saves the model to Model Registry." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ac2d08c5-5381-40e8-b8a2-a0737ca5f934", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "best_run = mlflow.search_runs(order_by=['metrics.auc DESC']).iloc[0]\n", + "print(f'AUC of Best Run: {best_run[\"metrics.auc\"]}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c34b4cf8-35f1-48e8-bc64-51b6b4cf0f70", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Update the production `wine_quality` model in MLflow Model Registry\n", + "\n", + "Earlier, you saved the baseline model to Model Registry with the name `wine_quality`. Now that you have a created a more accurate model, update `wine_quality`." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e7419396-4da9-4181-8a3d-613b80d3544b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "new_model_version = mlflow.register_model(f\"runs:/{best_run.run_id}/model\", model_name)\n", + "\n", + "# Registering the model takes a few seconds, so add a small delay\n", + "time.sleep(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "faaceaa1-fb8c-4e72-80c6-b9c7c5815a18", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Click **Models** in the left sidebar to see that the `wine_quality` model now has two versions. \n", + "\n", + "The following code promotes the new version to production." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "49c2eaec-eaf5-4e38-ab58-1d029271e4b9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Archive the old model version\n", + "client.transition_model_version_stage(\n", + " name=model_name,\n", + " version=model_version.version,\n", + " stage=\"Archived\"\n", + ")\n", + "\n", + "# Promote the new model version to Production\n", + "client.transition_model_version_stage(\n", + " name=model_name,\n", + " version=new_model_version.version,\n", + " stage=\"Production\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "11553e11-faea-43ae-aac1-10545a252e64", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Clients that call load_model now receive the new model." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6c7a24e2-9eb2-432b-af35-e3bf3dc03a46", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# This code is the same as the last block of \"Building a Baseline Model\". 
No change is required for clients to get the new model!\n", + "model = mlflow.pyfunc.load_model(f\"models:/{model_name}/production\")\n", + "print(f'AUC: {roc_auc_score(y_test, model.predict(X_test))}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "383276bc-bdfe-416f-956d-c45773af7a00", + "showTitle": false, + "title": "" + } + }, + "source": [ + "The auc value on the test set for the new model is 0.90. You beat the baseline!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2be6ef67-12ae-479a-8ba9-2a9ff90224b3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Batch inference\n", + "\n", + "There are many scenarios where you might want to evaluate a model on a corpus of new data. For example, you may have a fresh batch of data, or may need to compare the performance of two models on the same corpus of data.\n", + "\n", + "The following code evaluates the model on data stored in a Delta table, using Spark to run the computation in parallel." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5b42b701-ccf1-4bd0-a9dd-1e705bcdd795", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# To simulate a new corpus of data, save the existing X_train data to a Delta table. \n", + "# In the real world, this would be a new batch of data.\n", + "spark_df = spark.createDataFrame(X_train)\n", + "# Replace with your username before running this cell.\n", + "table_path = \"dbfs://delta/wine_data\"\n", + "# Delete the contents of this path in case this cell has already been run\n", + "dbutils.fs.rm(table_path, True)\n", + "spark_df.write.format(\"delta\").save(table_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f1be915c-f0ea-4003-acc3-a56941da3ca8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Load the model into a Spark UDF, so it can be applied to the Delta table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f9a1e264-e441-46b5-b38c-b896f7d21b1f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import mlflow.pyfunc\n", + "\n", + "apply_model_udf = mlflow.pyfunc.spark_udf(spark, f\"models:/{model_name}/production\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e6bd7794-a6d3-4f87-8b06-11cdef5ac607", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Read the \"new data\" from Delta\n", + "new_data = spark.read.format(\"delta\").load(table_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b09df20f-a6c7-40ba-a23d-cf158bc56cb0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "display(new_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "aa22c7ec-d8c9-4cdd-a7f6-4ba366978cee", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import struct\n", + "\n", + "# Apply the model to the new data\n", + "udf_inputs = struct(*(X_train.columns.tolist()))\n", + "\n", + "new_data = new_data.withColumn(\n", + " \"prediction\",\n", + " apply_model_udf(udf_inputs)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e3437534-cf5f-4acb-bfb4-23de89f88ddf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Each row now has an associated prediction. Note that the xgboost function does not output probabilities by default, so the predictions are not limited to the range [0, 1].\n", + "display(new_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "485cdc68-326a-4a17-ac48-5458413e8e8b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Model serving\n", + "\n", + "To productionize the model for low latency predictions, use Databricks Model Serving ([AWS](https://docs.databricks.com/machine-learning/model-serving/index.html)|[Azure](https://docs.microsoft.com/azure/databricks/machine-learning/model-serving/index)) to deploy the model to an endpoint.\n", + "\n", + "The following code illustrates how to issue requests using a REST API to get predictions from the deployed model." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "99b2067d-cd94-4ec7-9216-ed36c6253fa8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "You need a Databricks token to issue requests to your model endpoint. You can generate a token from the User Settings page (click Settings in the left sidebar). Copy the token into the next cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "06f073d5-9b77-41e1-8bcd-67f17e288d2d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"DATABRICKS_TOKEN\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "305d8fd0-3e43-4100-8ad3-a6f21eb8dddf", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Click **Models** in the left sidebar and navigate to the registered wine model. Click the serving tab, and then click **Enable Serving**.\n", + "\n", + "Then, under **Call The Model**, click the **Python** button to display a Python code snippet to issue requests. Copy the code into this notebook. It should look similar to the code in the next cell. \n", + "\n", + "You can use the token to make these requests from outside Databricks notebooks as well." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d8962617-79a4-4962-9491-dd77dbb95dd9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Replace with code snippet from the model serving page\n", + "import os\n", + "import requests\n", + "import pandas as pd\n", + "\n", + "def score_model(dataset: pd.DataFrame):\n", + " url = 'https:///model/wine_quality/Production/invocations'\n", + " headers = {'Authorization': f'Bearer {os.environ.get(\"DATABRICKS_TOKEN\")}'}\n", + " data_json = dataset.to_dict(orient='records')\n", + " response = requests.request(method='POST', headers=headers, url=url, json=data_json)\n", + " if response.status_code != 200:\n", + " raise Exception(f'Request failed with status {response.status_code}, {response.text}')\n", + " return response.json()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "feb6ed38-6684-492c-8d01-ab2352103246", + "showTitle": false, + "title": "" + } + }, + "source": [ + "The model predictions from the endpoint should agree with the results from locally evaluating the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2906da04-5ef6-4366-8ac4-bfa93ec21658", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Model serving is designed for low-latency predictions on smaller batches of data\n", + "num_predictions = 5\n", + "served_predictions = score_model(X_test[:num_predictions])\n", + "model_evaluations = model.predict(X_test[:num_predictions])\n", + "# Compare the results from the deployed model and the trained model\n", + "pd.DataFrame({\n", + " \"Model Prediction\": model_evaluations,\n", + " \"Served Model Prediction\": served_predictions,\n", + "})" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "mlflow-end-to-end-example", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/MLOps-ADO-ADB/src/workshop/notebooks/part_0_create_datasets.ipynb b/MLOps-ADO-ADB/src/workshop/notebooks/part_0_create_datasets.ipynb index b9095354..086afdb0 100644 --- a/MLOps-ADO-ADB/src/workshop/notebooks/part_0_create_datasets.ipynb +++ b/MLOps-ADO-ADB/src/workshop/notebooks/part_0_create_datasets.ipynb @@ -1,349 +1,349 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Part 0: Create Datasets for Training and Evaluation, and to mimic Production Data\n", - "\n", - "This notebook creates the datasets that are used in the workshop.\n", - "\n", - "- Import data from your local machine into the Databricks File System (DBFS)\n", - "\n", - "In this example, you build a model to predict the quality of Portugese \"Vinho Verde\" wine based on the wine's physicochemical properties. \n", - "\n", - "The example uses a dataset from the UCI Machine Learning Repository, presented in [*Modeling wine preferences by data mining from physicochemical properties*](https://www.sciencedirect.com/science/article/pii/S0167923609001377?via%3Dihub) [Cortez et al., 2009].\n", - "\n", - "## Requirements\n", - "This tutorial requires Databricks Runtime for Machine Learning." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "366dccef-47e4-466b-af76-53e29ad99472", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Import data\n", - " \n", - "In this section, you download a dataset from the web and save it to Databricks File System (DBFS).\n", - "For this tutorial, we will use a public dataset which can be found at: https://archive.ics.uci.edu/dataset/186/wine+quality\n", - "\n", - "Run the shell commands below to create a new directory in DBFS, download a `.zip` file with the data, and uncompress them to your directory" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "455183c1-b094-411e-a3ae-f88f1be3901b", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "%sh\n", - "mkdir -p /dbfs/tutorials/wine-data\n", - "wget https://archive.ics.uci.edu/static/public/186/wine+quality.zip -p -O /dbfs/tutorials/wine-data/wine-quality.zip\n", - "unzip -o /dbfs/tutorials/wine-data/wine-quality.zip -d /dbfs/tutorials/wine-data/" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "279cb6ec-2378-47ec-b7ac-8bcd234b206d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Read the Data\n", - "\n", - "Now that we have the data downloaded, we can use regular Python pandas commands to read the files." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "a1e7323b-decf-43f0-ab1b-c3b5a5e0159e", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "white_wine = pd.read_csv(\"/dbfs/tutorials/wine-data/winequality-white.csv\", sep=\";\")\n", - "red_wine = pd.read_csv(\"/dbfs/tutorials/wine-data/winequality-red.csv\", sep=\";\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "4bc5ccba-d4f2-4f80-a2e5-42d586434a16", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Take a peek at the data to make sure everything was read as expected...\n", - "display(white_wine)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "08eaf617-f2fd-4ea1-a054-633bd470b7f6", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Merge the two DataFrames into a single dataset, with a new binary feature \"is_red\" that indicates whether the wine is red or white." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "09e989d5-949f-4bec-9122-060b995a2f11", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "red_wine['is_red'] = 1\n", - "white_wine['is_red'] = 0\n", - "\n", - "data = pd.concat([red_wine, white_wine], axis=0)\n", - "\n", - "# Remove spaces from column names\n", - "data.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "cf5ae58a-d4d4-4006-9a8d-bc7ae8131c12", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "0d7f141b-79e1-4aed-bb8f-4ed9ce0c9b8e", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Save the data for training and validation\n", - "\n", - "We will save our combined datasets to a new file so we can use it in later steps for training and validation." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "9c653745-4ae1-4e45-9233-dea939b95f4c", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "data.to_csv(\"/dbfs/tutorials/wine-data/wine-quality-all-raw.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "3a9a09bb-484a-40fe-b888-8aacd95d2729", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Save data to mimic production batch inference data\n", - "There are many scenarios where you might want to evaluate a model on a corpus of new data. For example, you may have a fresh batch of data, or may need to compare the performance of two models on the same corpus of data.\n", - "\n", - "To simulate a new corpus of data, save the a bootstrap resample of the X_train data to a Delta table. In the real world, this would be a new batch of data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "517c4b25-4767-4e41-987a-04fccff18ad3", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# split the same as in the training notebook\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "# X = data.drop([\"quality\"], axis=1)\n", - "# y = data.quality\n", - "\n", - "# Split out the training data\n", - "# X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", - "\n", - "# Split the remaining data equally into validation and test\n", - "# X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)\n", - "\n", - "high_quality = (data.quality >= 7).astype(int)\n", - "data.quality = high_quality\n", - "\n", - "# X_new_batch = X_train.sample(frac=1.0, replace=True, random_state=123)\n", - "\n", - "X_new_batch = data.sample(frac=1.0, replace=True, random_state=123)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "ffdf5cd0-7b8a-4965-b31f-672507e4b108", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "spark_df = spark.createDataFrame(X_new_batch)\n", - "table_path = \"dbfs:/tutorials/wine-data/delta\"\n", - "\n", - "# Delete the contents of this path in case this cell has already been run\n", - "dbutils.fs.rm(table_path, True)\n", - "spark_df.write.format(\"delta\").save(table_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "81138f6a-94d6-45b2-8adf-6b4dcc935199", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "mostRecentlyExecutedCommandWithImplicitDF": { - "commandId": 2240891600679675, - "dataframes": [ - "_sqldf" - ] - }, - "pythonIndentUnit": 2 - }, - "notebookName": "part_0_create_datasets", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Part 0: Create Datasets for Training and Evaluation, and to mimic Production Data\n", + "\n", + "This notebook creates the datasets that are used in the workshop.\n", + "\n", + "- Import data from your local machine into the Databricks File System (DBFS)\n", + "\n", + "In this example, you build a model to predict the quality of Portugese \"Vinho Verde\" wine based on the wine's physicochemical properties. \n", + "\n", + "The example uses a dataset from the UCI Machine Learning Repository, presented in [*Modeling wine preferences by data mining from physicochemical properties*](https://www.sciencedirect.com/science/article/pii/S0167923609001377?via%3Dihub) [Cortez et al., 2009].\n", + "\n", + "## Requirements\n", + "This tutorial requires Databricks Runtime for Machine Learning." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "366dccef-47e4-466b-af76-53e29ad99472", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Import data\n", + " \n", + "In this section, you download a dataset from the web and save it to Databricks File System (DBFS).\n", + "For this tutorial, we will use a public dataset which can be found at: https://archive.ics.uci.edu/dataset/186/wine+quality\n", + "\n", + "Run the shell commands below to create a new directory in DBFS, download a `.zip` file with the data, and uncompress them to your directory" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "455183c1-b094-411e-a3ae-f88f1be3901b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "%sh\n", + "mkdir -p /dbfs/tutorials/wine-data\n", + "wget https://archive.ics.uci.edu/static/public/186/wine+quality.zip -p -O /dbfs/tutorials/wine-data/wine-quality.zip\n", + "unzip -o /dbfs/tutorials/wine-data/wine-quality.zip -d /dbfs/tutorials/wine-data/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "279cb6ec-2378-47ec-b7ac-8bcd234b206d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Read the Data\n", + "\n", + "Now that we have the data downloaded, we can use regular Python pandas commands to read the files." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a1e7323b-decf-43f0-ab1b-c3b5a5e0159e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "white_wine = pd.read_csv(\"/dbfs/tutorials/wine-data/winequality-white.csv\", sep=\";\")\n", + "red_wine = pd.read_csv(\"/dbfs/tutorials/wine-data/winequality-red.csv\", sep=\";\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4bc5ccba-d4f2-4f80-a2e5-42d586434a16", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Take a peek at the data to make sure everything was read as expected...\n", + "display(white_wine)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "08eaf617-f2fd-4ea1-a054-633bd470b7f6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Merge the two DataFrames into a single dataset, with a new binary feature \"is_red\" that indicates whether the wine is red or white." 
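The concatenation in the next cell already performs the merge itself; if you want a quick consistency check afterwards, the sketch below (illustrative only, not part of the workshop code) confirms that no rows are lost and that the new is_red flag splits cleanly between the two source files, assuming red_wine and white_wine were read as in the cells above.

# Consistency check for the merge performed in the next cell (illustrative only).
combined = pd.concat(
    [red_wine.assign(is_red=1), white_wine.assign(is_red=0)],
    axis=0,
)
# No rows should be lost or duplicated by the concatenation.
assert len(combined) == len(red_wine) + len(white_wine)
# is_red should count exactly the rows that came from each source file.
print(combined["is_red"].value_counts())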
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "09e989d5-949f-4bec-9122-060b995a2f11", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "red_wine['is_red'] = 1\n", + "white_wine['is_red'] = 0\n", + "\n", + "data = pd.concat([red_wine, white_wine], axis=0)\n", + "\n", + "# Remove spaces from column names\n", + "data.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cf5ae58a-d4d4-4006-9a8d-bc7ae8131c12", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0d7f141b-79e1-4aed-bb8f-4ed9ce0c9b8e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Save the data for training and validation\n", + "\n", + "We will save our combined datasets to a new file so we can use it in later steps for training and validation." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9c653745-4ae1-4e45-9233-dea939b95f4c", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "data.to_csv(\"/dbfs/tutorials/wine-data/wine-quality-all-raw.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3a9a09bb-484a-40fe-b888-8aacd95d2729", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Save data to mimic production batch inference data\n", + "There are many scenarios where you might want to evaluate a model on a corpus of new data. For example, you may have a fresh batch of data, or may need to compare the performance of two models on the same corpus of data.\n", + "\n", + "To simulate a new corpus of data, save the a bootstrap resample of the X_train data to a Delta table. In the real world, this would be a new batch of data." 
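After the cells below have written the bootstrap sample to Delta, it can help to read the table back and confirm the simulated batch landed with the expected row count. A short sketch, assuming the same table_path used below:

# Read the simulated production batch back from Delta to verify the write.
table_path = "dbfs:/tutorials/wine-data/delta"
batch_df = spark.read.format("delta").load(table_path)

print(f"Rows in simulated batch: {batch_df.count()}")
batch_df.limit(5).show()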
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "517c4b25-4767-4e41-987a-04fccff18ad3", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# split the same as in the training notebook\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# X = data.drop([\"quality\"], axis=1)\n", + "# y = data.quality\n", + "\n", + "# Split out the training data\n", + "# X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", + "\n", + "# Split the remaining data equally into validation and test\n", + "# X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)\n", + "\n", + "high_quality = (data.quality >= 7).astype(int)\n", + "data.quality = high_quality\n", + "\n", + "# X_new_batch = X_train.sample(frac=1.0, replace=True, random_state=123)\n", + "\n", + "X_new_batch = data.sample(frac=1.0, replace=True, random_state=123)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ffdf5cd0-7b8a-4965-b31f-672507e4b108", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "spark_df = spark.createDataFrame(X_new_batch)\n", + "table_path = \"dbfs:/tutorials/wine-data/delta\"\n", + "\n", + "# Delete the contents of this path in case this cell has already been run\n", + "dbutils.fs.rm(table_path, True)\n", + "spark_df.write.format(\"delta\").save(table_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "81138f6a-94d6-45b2-8adf-6b4dcc935199", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 2240891600679675, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "part_0_create_datasets", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/MLOps-ADO-ADB/src/workshop/notebooks/part_1_1_data_prep.ipynb b/MLOps-ADO-ADB/src/workshop/notebooks/part_1_1_data_prep.ipynb index 42c076ce..14b28473 100644 --- a/MLOps-ADO-ADB/src/workshop/notebooks/part_1_1_data_prep.ipynb +++ b/MLOps-ADO-ADB/src/workshop/notebooks/part_1_1_data_prep.ipynb @@ -1,356 +1,356 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Part 1: Data Prep\n", - "\n", - "This part of the tutorial covers the following steps:\n", - "- Visualize the data using Seaborn and matplotlib\n", - "- Construct a new binary outcome variable labeling higher quality wines.\n", - "\n", - "This notebook is designed to focus on data preparation as a set of concerns distinct from other parts of the model development workflow." 
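Each workshop notebook opens with the run_name widget pattern shown in the next cell. When the notebooks are later chained together as a job, the same value can be supplied programmatically; the snippet below is a hypothetical orchestration sketch (the relative notebook path and the "alice" value are illustrative, not part of the workshop code):

# Pass run_name into the notebook as a widget value when orchestrating it.
result = dbutils.notebook.run(
    "./part_1_1_data_prep",    # illustrative relative notebook path
    3600,                      # timeout in seconds
    {"run_name": "alice"},     # read inside the notebook via dbutils.widgets.get("run_name")
)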
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "87ced92f-f3e3-477f-bfb5-0be8762819ac", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Multiple people may be running this workshop at the same time. We want each\n", - "# participant to have their own set of files. To create your own file storage area,\n", - "# put your name below:\n", - "\n", - "your_name = \"\"\n", - "\n", - "try: run_name = dbutils.widgets.get(\"run_name\")\n", - "except: run_name = your_name.strip()\n", - "run_name = \"no_name\" if run_name == \"\" else run_name" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "2cb8367c-f070-4c8f-9ed8-1779689e54c9", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Load data\n", - "\n", - "This notebook assumes that a set of data is available for iterative training and validation. That dataset may be the output of some other ETL or data engineering process. The data science team will do some additional work to prepare the data for model training, which could include data quality checks, feature engineering, and target variable engineering." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "a1f9b355-aa7c-40f3-9967-f809a7e33e5d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "data = pd.read_csv(\"/dbfs/tutorials/wine-data/wine-quality-all-raw.csv\")\n", - "data = data.drop([\"Unnamed: 0\"], axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "4ce0fcc0-6131-4db8-8497-5e525b7252c3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Visualize data\n", - "\n", - "Before training a model, explore the dataset using Seaborn and Matplotlib.\n", - "Although this notebook will eventually be run by an automated process, namely an Azure Pipeline triggered by a version-controlled code change and running \"headless\" and without human review, while we're developing the data prep code we'll typically plot variables to inform our decisions about what data preparation is necessary before handing off to model training." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "31cf51f6-4bd1-4aa5-ac59-9deede4ccf86", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Plot a histogram of the dependent variable, quality." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "5f38e522-42e9-4673-8bfa-43a5fc28ee6d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import seaborn as sns\n", - "sns.distplot(data.quality, kde=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "cce08fec-03ea-4d8a-846e-b602ea950be3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Looks like quality scores are normally distributed between 3 and 9. \n", - "\n", - "Define a wine as high quality if it has quality >= 7.\n", - "\n", - "Again, this notebook focuses on the preprocessing of the data in order to prepare it for modeling. In this example data prep notebook, the only substantive work done is creating this new binary variable, `quality`." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "501a185a-3125-4ac4-83c1-61d28b529eaf", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "high_quality = (data.quality >= 7).astype(int)\n", - "data.quality = high_quality" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f7b8d9dc-4408-4a84-9a5d-3d187960d9fe", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Box plots are useful in noticing correlations between features and a binary label." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "0ee658d2-fb77-451b-b119-f680eb8f345b", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "dims = (3, 4)\n", - "\n", - "f, axes = plt.subplots(dims[0], dims[1], figsize=(25, 15))\n", - "axis_i, axis_j = 0, 0\n", - "for col in data.columns:\n", - " if col == 'is_red' or col == 'quality':\n", - " continue # Box plots cannot be used on indicator variables\n", - " sns.boxplot(x=high_quality, y=data[col], ax=axes[axis_i, axis_j])\n", - " axis_j += 1\n", - " if axis_j == dims[1]:\n", - " axis_i += 1\n", - " axis_j = 0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "c379687d-03da-41aa-bdf5-953db5ec9534", - "showTitle": false, - "title": "" - } - }, - "source": [ - "In the above box plots, a few variables stand out as good univariate predictors of quality. \n", - "\n", - "- In the alcohol box plot, the median alcohol content of high quality wines is greater than even the 75th quantile of low quality wines. High alcohol content is correlated with quality.\n", - "- In the density box plot, low quality wines have a greater density than high quality wines. Density is inversely correlated with quality." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "27b6967c-5a25-4fe6-894c-4c07a63014d7", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Preprocess data\n", - "Prior to training a model, check for missing values and split the data into training and validation sets." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "601c2717-c8a5-4411-ba78-e033a9bcdd83", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "data.isna().any()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "aff3310f-e556-44b6-8f7d-e547682e7677", - "showTitle": false, - "title": "" - } - }, - "source": [ - "There are no missing values." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "ef54ea94-8dd2-4526-ab57-a63d12daf763", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Save prepped and checked data" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b1992b0b-80f0-4123-915b-7fa386c27ed5", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "dbutils.fs.mkdirs(f\"/tutorials/wine-data/{run_name}\")\n", - "data.to_csv(f\"/dbfs/tutorials/wine-data/{run_name}/wine-quality-all-prepped.csv\")" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "part_1_1_data_prep", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Part 1: Data Prep\n", + "\n", + "This part of the tutorial covers the following steps:\n", + "- Visualize the data using Seaborn and matplotlib\n", + "- Construct a new binary outcome variable labeling higher quality wines.\n", + "\n", + "This notebook is designed to focus on data preparation as a set of concerns distinct from other parts of the model development workflow." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "87ced92f-f3e3-477f-bfb5-0be8762819ac", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Multiple people may be running this workshop at the same time. We want each\n", + "# participant to have their own set of files. 
To create your own file storage area,\n", + "# put your name below:\n", + "\n", + "your_name = \"\"\n", + "\n", + "try: run_name = dbutils.widgets.get(\"run_name\")\n", + "except: run_name = your_name.strip()\n", + "run_name = \"no_name\" if run_name == \"\" else run_name" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2cb8367c-f070-4c8f-9ed8-1779689e54c9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Load data\n", + "\n", + "This notebook assumes that a set of data is available for iterative training and validation. That dataset may be the output of some other ETL or data engineering process. The data science team will do some additional work to prepare the data for model training, which could include data quality checks, feature engineering, and target variable engineering." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a1f9b355-aa7c-40f3-9967-f809a7e33e5d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "data = pd.read_csv(\"/dbfs/tutorials/wine-data/wine-quality-all-raw.csv\")\n", + "data = data.drop([\"Unnamed: 0\"], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4ce0fcc0-6131-4db8-8497-5e525b7252c3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Visualize data\n", + "\n", + "Before training a model, explore the dataset using Seaborn and Matplotlib.\n", + "Although this notebook will eventually be run by an automated process, namely an Azure Pipeline triggered by a version-controlled code change and running \"headless\" and without human review, while we're developing the data prep code we'll typically plot variables to inform our decisions about what data preparation is necessary before handing off to model training." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "31cf51f6-4bd1-4aa5-ac59-9deede4ccf86", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Plot a histogram of the dependent variable, quality." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5f38e522-42e9-4673-8bfa-43a5fc28ee6d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "sns.distplot(data.quality, kde=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "cce08fec-03ea-4d8a-846e-b602ea950be3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Looks like quality scores are normally distributed between 3 and 9. \n", + "\n", + "Define a wine as high quality if it has quality >= 7.\n", + "\n", + "Again, this notebook focuses on the preprocessing of the data in order to prepare it for modeling. In this example data prep notebook, the only substantive work done is creating this new binary variable, `quality`." 
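Only scores of 7 and above map to the positive class, so the label created in the next cell is noticeably imbalanced; checking the class balance up front makes later metrics such as AUC easier to interpret. A small sketch, assuming data is the DataFrame loaded above:

# Check how imbalanced the binary label will be before training on it.
high_quality = (data["quality"] >= 7).astype(int)
print(f"Share of high-quality wines: {high_quality.mean():.1%}")
print(high_quality.value_counts())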
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "501a185a-3125-4ac4-83c1-61d28b529eaf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "high_quality = (data.quality >= 7).astype(int)\n", + "data.quality = high_quality" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f7b8d9dc-4408-4a84-9a5d-3d187960d9fe", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Box plots are useful in noticing correlations between features and a binary label." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0ee658d2-fb77-451b-b119-f680eb8f345b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "dims = (3, 4)\n", + "\n", + "f, axes = plt.subplots(dims[0], dims[1], figsize=(25, 15))\n", + "axis_i, axis_j = 0, 0\n", + "for col in data.columns:\n", + " if col == 'is_red' or col == 'quality':\n", + " continue # Box plots cannot be used on indicator variables\n", + " sns.boxplot(x=high_quality, y=data[col], ax=axes[axis_i, axis_j])\n", + " axis_j += 1\n", + " if axis_j == dims[1]:\n", + " axis_i += 1\n", + " axis_j = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c379687d-03da-41aa-bdf5-953db5ec9534", + "showTitle": false, + "title": "" + } + }, + "source": [ + "In the above box plots, a few variables stand out as good univariate predictors of quality. \n", + "\n", + "- In the alcohol box plot, the median alcohol content of high quality wines is greater than even the 75th quantile of low quality wines. High alcohol content is correlated with quality.\n", + "- In the density box plot, low quality wines have a greater density than high quality wines. Density is inversely correlated with quality." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "27b6967c-5a25-4fe6-894c-4c07a63014d7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Preprocess data\n", + "Prior to training a model, check for missing values and split the data into training and validation sets." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "601c2717-c8a5-4411-ba78-e033a9bcdd83", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "data.isna().any()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "aff3310f-e556-44b6-8f7d-e547682e7677", + "showTitle": false, + "title": "" + } + }, + "source": [ + "There are no missing values." 
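Because this notebook is also meant to run headless in an automated pipeline, the visual data.isna().any() check above can be paired with an assertion so the job fails loudly if missing values ever show up in a future data drop. A minimal sketch:

# Fail fast in automated runs if any column contains missing values.
missing_counts = data.isna().sum()
assert missing_counts.sum() == 0, (
    f"Missing values detected:\n{missing_counts[missing_counts > 0]}"
)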
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ef54ea94-8dd2-4526-ab57-a63d12daf763", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Save prepped and checked data" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b1992b0b-80f0-4123-915b-7fa386c27ed5", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.fs.mkdirs(f\"/tutorials/wine-data/{run_name}\")\n", + "data.to_csv(f\"/dbfs/tutorials/wine-data/{run_name}/wine-quality-all-prepped.csv\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "part_1_1_data_prep", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/MLOps-ADO-ADB/src/workshop/notebooks/part_1_2_training.ipynb b/MLOps-ADO-ADB/src/workshop/notebooks/part_1_2_training.ipynb index 9c262e80..35ba3772 100644 --- a/MLOps-ADO-ADB/src/workshop/notebooks/part_1_2_training.ipynb +++ b/MLOps-ADO-ADB/src/workshop/notebooks/part_1_2_training.ipynb @@ -1,524 +1,524 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Part 1: Model Training\n", - "\n", - "The Part 0 notebook sets up a raw file to be used for model training and validation.\n", - "\n", - "The Part 1 Data Prep notebook handles data preparation and quality checking steps. \n", - "\n", - "This notebook focuses on model training, making use of the data prepared in the data prep notebook that will run as a previous job in the workflow.\n", - "\n", - "The notebook will handle the following steps:\n", - "- Split the prepared data into training and validation datasets.\n", - "- Build a simple classifier to predict wine quality based on the available features in the data.\n", - "- Register the model in MLflow as the baseline model that we'll try to beat by changing parts of the model development workflow\n", - "\n", - "When these steps complete, we have a baseline model that can generate predictions of the quality of Portugese wines based on the wine's measured physicochemical properties. \n", - "\n", - "## Requirements\n", - "This tutorial requires Databricks Runtime for Machine Learning." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "c482f0a0-9ba5-455c-846e-9a93734a18d5", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Multiple people may be running this workshop at the same time. We want each\n", - "# participant to have their own set of files. 
To create your own file storage area,\n", - "# put your name below:\n", - "\n", - "your_name = \"\"\n", - "\n", - "try: run_name = dbutils.widgets.get(\"run_name\")\n", - "except: run_name = your_name.strip()\n", - "run_name = \"no_name\" if run_name == \"\" else run_name" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "953f4f1f-405b-44e1-863b-600e6f2bc94f", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Load the prepared data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "4bf02fb6-5f66-4a73-a166-d51137f7509c", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "data = pd.read_csv(f\"/dbfs/tutorials/wine-data/{run_name}/wine-quality-all-prepped.csv\")\n", - "data = data.drop([\"Unnamed: 0\"], axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "379abb9a-bd5b-46b9-8975-ce91858eba39", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Split dataset for training baseline model\n", - "Split the input data into 3 sets:\n", - "- Train (60% of the dataset used to train the model)\n", - "- Validation (20% of the dataset used to tune the hyperparameters)\n", - "- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)\n", - "\n", - "The test dataset will not be used in this model training notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "e70c1c07-0048-4bb8-a0c3-17a7895bb4b2", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "X = data.drop([\"quality\"], axis=1)\n", - "y = data.quality\n", - "\n", - "# Split out the training data\n", - "X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", - "\n", - "# Split the remaining data equally into validation and test\n", - "X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "224d0ddf-3925-4849-bc20-564175c212e8", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Build a baseline model\n", - "This task seems well suited to a random forest classifier, since the output is binary and there may be interactions between multiple variables.\n", - "\n", - "The following code builds a simple classifier using scikit-learn. It uses MLflow to keep track of the model accuracy, and to save the model for later use." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "ea321ffb-7fc4-4f8a-9517-157b6c823341", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "%sh\n", - "mkdir -p /Workspace/Shared/wine_quality/experiments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "46af6002-8063-4ada-b7e5-6ce56c22f60a", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import mlflow\n", - "import mlflow.pyfunc\n", - "import mlflow.sklearn\n", - "import numpy as np\n", - "import sklearn\n", - "# from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.metrics import roc_auc_score\n", - "from mlflow.models.signature import infer_signature\n", - "from mlflow.utils.environment import _mlflow_conda_env\n", - "import cloudpickle\n", - "import time\n", - "\n", - "# The predict method of sklearn's RandomForestClassifier returns a binary classification (0 or 1). \n", - "# The following code creates a wrapper function, SklearnModelWrapper, that uses \n", - "# the predict_proba method to return the probability that the observation belongs to each class. \n", - "\n", - "class SklearnModelWrapper(mlflow.pyfunc.PythonModel):\n", - " def __init__(self, model):\n", - " self.model = model\n", - " \n", - " def predict(self, context, model_input):\n", - " return self.model.predict_proba(model_input)[:,1]\n", - "\n", - "mlflow.set_experiment(f\"/Shared/wine_quality/experiments/{run_name}\")\n", - "# mlflow.start_run creates a new MLflow run to track the performance of this model. \n", - "# Within the context, you call mlflow.log_param to keep track of the parameters used, and\n", - "# mlflow.log_metric to record metrics like accuracy.\n", - "with mlflow.start_run(run_name='untuned_random_forest'):\n", - " n_estimators = 10\n", - " model = RandomForestClassifier(n_estimators=n_estimators, random_state=np.random.RandomState(123))\n", - " model.fit(X_train, y_train)\n", - "\n", - " # predict_proba returns [prob_negative, prob_positive], so slice the output with [:, 1]\n", - " predictions_test = model.predict_proba(X_test)[:,1]\n", - " auc_score = roc_auc_score(y_test, predictions_test)\n", - " mlflow.log_param('n_estimators', n_estimators)\n", - " # Use the area under the ROC curve as a metric.\n", - " mlflow.log_metric('auc', auc_score)\n", - " wrappedModel = SklearnModelWrapper(model)\n", - " # Log the model with a signature that defines the schema of the model's inputs and outputs. 
\n", - " # When the model is deployed, this signature will be used to validate inputs.\n", - " signature = infer_signature(X_train, wrappedModel.predict(None, X_train))\n", - " \n", - " # MLflow contains utilities to create a conda environment used to serve models.\n", - " # The necessary dependencies are added to a conda.yaml file which is logged along with the model.\n", - " conda_env = _mlflow_conda_env(\n", - " additional_conda_deps=None,\n", - " additional_pip_deps=[\"cloudpickle=={}\".format(cloudpickle.__version__), \"scikit-learn=={}\".format(sklearn.__version__)],\n", - " additional_conda_channels=None,\n", - " )\n", - " mlflow.pyfunc.log_model(\"model\", python_model=wrappedModel, conda_env=conda_env, signature=signature)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "867b3e23-9147-4ca4-b2d2-f1103c471d53", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Examine the learned feature importances output by the model as a sanity-check." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1e174abe-774b-461e-b390-212e11d1c68d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])\n", - "feature_importances.sort_values('importance', ascending=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "06a939b8-7232-464b-8370-2bc9e3a8c613", - "showTitle": false, - "title": "" - } - }, - "source": [ - "As illustrated by the boxplots shown previously, both alcohol and density are important in predicting quality." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f8981b9f-8833-4dc4-a1cb-b25f9f43a8c6", - "showTitle": false, - "title": "" - } - }, - "source": [ - "You logged the Area Under the ROC Curve (AUC) to MLflow. Click **Experiment** at the upper right to display the Experiment Runs sidebar. \n", - "\n", - "The model achieved an AUC of 0.854.\n", - "\n", - "A random classifier would have an AUC of 0.5, and higher AUC values are better. For more information, see [Receiver Operating Characteristic Curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "8d1df4d8-4b50-47d8-ad89-d3c80ce960ee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "#### Register the model in MLflow Model Registry\n", - "\n", - "By registering this model in Model Registry, you can easily reference the model from anywhere within Databricks.\n", - "\n", - "The following section shows how to do this programmatically, but you can also register a model using the UI. See \"[Create or register a model using the UI](https://docs.microsoft.com/azure/databricks/applications/machine-learning/manage-model-lifecycle/index#create-or-register-a-model-using-the-ui)\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "9f7b7e14-b016-41c1-9852-aae0f6625e93", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "run_id = mlflow.search_runs(filter_string='tags.mlflow.runName = \"untuned_random_forest\"').iloc[0].run_id\n", - "\n", - "# uncomment when incorporating hyperparameter search code in Part 4\n", - "# best_run = mlflow.search_runs(order_by=['metrics.auc DESC']).iloc[0]\n", - "# print(f'AUC of Best Run: {best_run[\"metrics.auc\"]}')\n", - "# run_id = best_run.run_id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "4774edbf-f3fb-495c-a638-ec63f275f885", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# If you see the error \"PERMISSION_DENIED: User does not have any permission level assigned to the registered model\", \n", - "# the cause may be that a model already exists with the name \"wine_quality\". Try using a different name.\n", - "\n", - "# to create your own version of the model, uncomment the next line, and comment the line after\n", - "# model_name = f\"wine_quality-{run_name}\"\n", - "model_name = \"wine_quality\"\n", - "model_version = mlflow.register_model(f\"runs:/{run_id}/model\", model_name)\n", - "\n", - "# Registering the model takes a few seconds, so add a small delay\n", - "time.sleep(15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "46f16d91-13ad-4e28-866f-3a979eb6fde1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "You should now see the model in the Models page. To display the Models page, click the Models icon in the left sidebar. \n", - "\n", - "Next, transition this model to staging and load it into this notebook from Model Registry." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "cb032bd6-7e9d-4430-8c13-895ebc189297", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from mlflow.tracking import MlflowClient\n", - "\n", - "client = MlflowClient()\n", - "client.transition_model_version_stage(\n", - " name=model_name,\n", - " version=model_version.version,\n", - " stage=\"Staging\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f284978e-9c9b-4cd2-8bdc-1393991c5701", - "showTitle": false, - "title": "" - } - }, - "source": [ - "The Models page now shows the model version in stage \"Staging\".\n", - "\n", - "You can now refer to the model using the path \"models:/wine_quality-{yourname}/staging\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "58b22211-62ea-4fdd-a6ff-c552a65fa6e9", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "model = mlflow.pyfunc.load_model(f\"models:/{model_name}/staging\")\n", - "\n", - "# Sanity-check: This should match the AUC logged by MLflow\n", - "print(f'AUC: {roc_auc_score(y_test, model.predict(X_test))}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "673ef4b4-6d74-4fe9-af77-a013148060ec", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "mostRecentlyExecutedCommandWithImplicitDF": { - "commandId": 1569140396715336, - "dataframes": [ - "_sqldf" - ] - }, - "pythonIndentUnit": 2 - }, - "notebookName": "part_1_2_training", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Part 1: Model Training\n", + "\n", + "The Part 0 notebook sets up a raw file to be used for model training and validation.\n", + "\n", + "The Part 1 Data Prep notebook handles data preparation and quality checking steps. \n", + "\n", + "This notebook focuses on model training, making use of the data prepared in the data prep notebook that will run as a previous job in the workflow.\n", + "\n", + "The notebook will handle the following steps:\n", + "- Split the prepared data into training and validation datasets.\n", + "- Build a simple classifier to predict wine quality based on the available features in the data.\n", + "- Register the model in MLflow as the baseline model that we'll try to beat by changing parts of the model development workflow\n", + "\n", + "When these steps complete, we have a baseline model that can generate predictions of the quality of Portugese wines based on the wine's measured physicochemical properties. \n", + "\n", + "## Requirements\n", + "This tutorial requires Databricks Runtime for Machine Learning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c482f0a0-9ba5-455c-846e-9a93734a18d5", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Multiple people may be running this workshop at the same time. We want each\n", + "# participant to have their own set of files. 
To create your own file storage area,\n", + "# put your name below:\n", + "\n", + "your_name = \"\"\n", + "\n", + "try: run_name = dbutils.widgets.get(\"run_name\")\n", + "except: run_name = your_name.strip()\n", + "run_name = \"no_name\" if run_name == \"\" else run_name" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "953f4f1f-405b-44e1-863b-600e6f2bc94f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Load the prepared data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4bf02fb6-5f66-4a73-a166-d51137f7509c", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "data = pd.read_csv(f\"/dbfs/tutorials/wine-data/{run_name}/wine-quality-all-prepped.csv\")\n", + "data = data.drop([\"Unnamed: 0\"], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "379abb9a-bd5b-46b9-8975-ce91858eba39", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Split dataset for training baseline model\n", + "Split the input data into 3 sets:\n", + "- Train (60% of the dataset used to train the model)\n", + "- Validation (20% of the dataset used to tune the hyperparameters)\n", + "- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)\n", + "\n", + "The test dataset will not be used in this model training notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e70c1c07-0048-4bb8-a0c3-17a7895bb4b2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = data.drop([\"quality\"], axis=1)\n", + "y = data.quality\n", + "\n", + "# Split out the training data\n", + "X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", + "\n", + "# Split the remaining data equally into validation and test\n", + "X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "224d0ddf-3925-4849-bc20-564175c212e8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Build a baseline model\n", + "This task seems well suited to a random forest classifier, since the output is binary and there may be interactions between multiple variables.\n", + "\n", + "The following code builds a simple classifier using scikit-learn. It uses MLflow to keep track of the model accuracy, and to save the model for later use." 
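One detail to watch in the training cell below: the import of RandomForestClassifier is left commented out, so the cell as written would stop with a NameError unless the import is restored. A minimal sketch of the baseline fit with the import in place, assuming the X_train/y_train and X_test/y_test splits from the previous cell:

# Restore the import the training cell below relies on.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

# Minimal baseline fit mirroring the training cell (10 trees, fixed seed).
rf = RandomForestClassifier(n_estimators=10, random_state=np.random.RandomState(123))
rf.fit(X_train, y_train)
print(f"AUC: {roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]):.3f}")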
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ea321ffb-7fc4-4f8a-9517-157b6c823341", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "%sh\n", + "mkdir -p /Workspace/Shared/wine_quality/experiments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "46af6002-8063-4ada-b7e5-6ce56c22f60a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import mlflow\n", + "import mlflow.pyfunc\n", + "import mlflow.sklearn\n", + "import numpy as np\n", + "import sklearn\n", + "# from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import roc_auc_score\n", + "from mlflow.models.signature import infer_signature\n", + "from mlflow.utils.environment import _mlflow_conda_env\n", + "import cloudpickle\n", + "import time\n", + "\n", + "# The predict method of sklearn's RandomForestClassifier returns a binary classification (0 or 1). \n", + "# The following code creates a wrapper function, SklearnModelWrapper, that uses \n", + "# the predict_proba method to return the probability that the observation belongs to each class. \n", + "\n", + "class SklearnModelWrapper(mlflow.pyfunc.PythonModel):\n", + " def __init__(self, model):\n", + " self.model = model\n", + " \n", + " def predict(self, context, model_input):\n", + " return self.model.predict_proba(model_input)[:,1]\n", + "\n", + "mlflow.set_experiment(f\"/Shared/wine_quality/experiments/{run_name}\")\n", + "# mlflow.start_run creates a new MLflow run to track the performance of this model. \n", + "# Within the context, you call mlflow.log_param to keep track of the parameters used, and\n", + "# mlflow.log_metric to record metrics like accuracy.\n", + "with mlflow.start_run(run_name='untuned_random_forest'):\n", + " n_estimators = 10\n", + " model = RandomForestClassifier(n_estimators=n_estimators, random_state=np.random.RandomState(123))\n", + " model.fit(X_train, y_train)\n", + "\n", + " # predict_proba returns [prob_negative, prob_positive], so slice the output with [:, 1]\n", + " predictions_test = model.predict_proba(X_test)[:,1]\n", + " auc_score = roc_auc_score(y_test, predictions_test)\n", + " mlflow.log_param('n_estimators', n_estimators)\n", + " # Use the area under the ROC curve as a metric.\n", + " mlflow.log_metric('auc', auc_score)\n", + " wrappedModel = SklearnModelWrapper(model)\n", + " # Log the model with a signature that defines the schema of the model's inputs and outputs. 
\n", + " # When the model is deployed, this signature will be used to validate inputs.\n", + " signature = infer_signature(X_train, wrappedModel.predict(None, X_train))\n", + " \n", + " # MLflow contains utilities to create a conda environment used to serve models.\n", + " # The necessary dependencies are added to a conda.yaml file which is logged along with the model.\n", + " conda_env = _mlflow_conda_env(\n", + " additional_conda_deps=None,\n", + " additional_pip_deps=[\"cloudpickle=={}\".format(cloudpickle.__version__), \"scikit-learn=={}\".format(sklearn.__version__)],\n", + " additional_conda_channels=None,\n", + " )\n", + " mlflow.pyfunc.log_model(\"model\", python_model=wrappedModel, conda_env=conda_env, signature=signature)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "867b3e23-9147-4ca4-b2d2-f1103c471d53", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Examine the learned feature importances output by the model as a sanity-check." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1e174abe-774b-461e-b390-212e11d1c68d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])\n", + "feature_importances.sort_values('importance', ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "06a939b8-7232-464b-8370-2bc9e3a8c613", + "showTitle": false, + "title": "" + } + }, + "source": [ + "As illustrated by the boxplots shown previously, both alcohol and density are important in predicting quality." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f8981b9f-8833-4dc4-a1cb-b25f9f43a8c6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "You logged the Area Under the ROC Curve (AUC) to MLflow. Click **Experiment** at the upper right to display the Experiment Runs sidebar. \n", + "\n", + "The model achieved an AUC of 0.854.\n", + "\n", + "A random classifier would have an AUC of 0.5, and higher AUC values are better. For more information, see [Receiver Operating Characteristic Curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8d1df4d8-4b50-47d8-ad89-d3c80ce960ee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Register the model in MLflow Model Registry\n", + "\n", + "By registering this model in Model Registry, you can easily reference the model from anywhere within Databricks.\n", + "\n", + "The following section shows how to do this programmatically, but you can also register a model using the UI. See \"[Create or register a model using the UI](https://docs.microsoft.com/azure/databricks/applications/machine-learning/manage-model-lifecycle/index#create-or-register-a-model-using-the-ui)\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9f7b7e14-b016-41c1-9852-aae0f6625e93", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "run_id = mlflow.search_runs(filter_string='tags.mlflow.runName = \"untuned_random_forest\"').iloc[0].run_id\n", + "\n", + "# uncomment when incorporating hyperparameter search code in Part 4\n", + "# best_run = mlflow.search_runs(order_by=['metrics.auc DESC']).iloc[0]\n", + "# print(f'AUC of Best Run: {best_run[\"metrics.auc\"]}')\n", + "# run_id = best_run.run_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4774edbf-f3fb-495c-a638-ec63f275f885", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# If you see the error \"PERMISSION_DENIED: User does not have any permission level assigned to the registered model\", \n", + "# the cause may be that a model already exists with the name \"wine_quality\". Try using a different name.\n", + "\n", + "# to create your own version of the model, uncomment the next line, and comment the line after\n", + "# model_name = f\"wine_quality-{run_name}\"\n", + "model_name = \"wine_quality\"\n", + "model_version = mlflow.register_model(f\"runs:/{run_id}/model\", model_name)\n", + "\n", + "# Registering the model takes a few seconds, so add a small delay\n", + "time.sleep(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "46f16d91-13ad-4e28-866f-3a979eb6fde1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "You should now see the model in the Models page. To display the Models page, click the Models icon in the left sidebar. \n", + "\n", + "Next, transition this model to staging and load it into this notebook from Model Registry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cb032bd6-7e9d-4430-8c13-895ebc189297", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from mlflow.tracking import MlflowClient\n", + "\n", + "client = MlflowClient()\n", + "client.transition_model_version_stage(\n", + " name=model_name,\n", + " version=model_version.version,\n", + " stage=\"Staging\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f284978e-9c9b-4cd2-8bdc-1393991c5701", + "showTitle": false, + "title": "" + } + }, + "source": [ + "The Models page now shows the model version in stage \"Staging\".\n", + "\n", + "You can now refer to the model using the path \"models:/wine_quality-{yourname}/staging\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "58b22211-62ea-4fdd-a6ff-c552a65fa6e9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "model = mlflow.pyfunc.load_model(f\"models:/{model_name}/staging\")\n", + "\n", + "# Sanity-check: This should match the AUC logged by MLflow\n", + "print(f'AUC: {roc_auc_score(y_test, model.predict(X_test))}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "673ef4b4-6d74-4fe9-af77-a013148060ec", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 1569140396715336, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "part_1_2_training", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/MLOps-ADO-ADB/src/workshop/notebooks/part_1_3_evaluating.ipynb b/MLOps-ADO-ADB/src/workshop/notebooks/part_1_3_evaluating.ipynb index b39cfeb1..5919cf06 100644 --- a/MLOps-ADO-ADB/src/workshop/notebooks/part_1_3_evaluating.ipynb +++ b/MLOps-ADO-ADB/src/workshop/notebooks/part_1_3_evaluating.ipynb @@ -1,412 +1,412 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Part 1: Evaluating\n", - "\n", - "The Part 1 Data Prep notebook handles data preparation and quality checking steps. \n", - "\n", - "The Part 1 Model Training notebook builds a model and writes metrics to MLflow. \n", - "\n", - "This notebook will handle the following steps:\n", - "- Load the test data.\n", - "- Load the model registered to staging in the training step.\n", - "- Use the trained model to predict on the test data and generate model evaluation metrics.\n", - "- If no prior trained model exists, the model will be registered as a baseline model in production.\n", - "- If a production model is found, the evaluation metrics for that model will be compared against the newly trained model and if they surpass production, model will be registered to production.\n", - "\n", - "\n", - "## Requirements\n", - "This tutorial requires Databricks Runtime for Machine Learning." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "6c08ac56-142f-4c14-84ea-59d56e1c95db", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Multiple people may be running this workshop at the same time. We want each\n", - "# participant to have their own set of files. 
To create your own file storage area,\n", - "# put your name below:\n", - "\n", - "your_name = \"\"\n", - "\n", - "try: run_name = dbutils.widgets.get(\"run_name\")\n", - "except: run_name = your_name.strip()\n", - "run_name = \"no_name\" if run_name == \"\" else run_name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "c79dcf84-c3b4-43d7-a626-86fa258520ef", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# We need to know if this is running as part of a Continuous Integration or as part of a\n", - "# Continuous Deployment. Let's look for a flag that will tell us.\n", - "devops_action = \"\"\n", - "try: devops_action = dbutils.widgets.get(\"devops_action\")\n", - "except: devops_action = \"unknown\"\n", - "devops_action = devops_action.strip().upper()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "953f4f1f-405b-44e1-863b-600e6f2bc94f", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Load the prepared data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "4bf02fb6-5f66-4a73-a166-d51137f7509c", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "if devops_action == \"INTEGRATION\" or devops_action == \"UNKNOWN\":\n", - " data = pd.read_csv(f\"/dbfs/tutorials/wine-data/{run_name}/wine-quality-all-prepped.csv\")\n", - " data = data.drop([\"Unnamed: 0\"], axis=1)\n", - "elif devops_action == \"DEPLOYMENT\":\n", - " data = spark.read.format(\"delta\").load(\"dbfs:/tutorials/wine-data/delta\")\n", - " data = data.toPandas()\n", - "\n", - "# add in the corresponding parameter to the cd pipeline\n", - "# and parameter handling in this notebook" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "379abb9a-bd5b-46b9-8975-ce91858eba39", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Split dataset and use test dataset to measure trained model\n", - "Split the input data into 3 sets:\n", - "- Train (60% of the dataset used to train the model)\n", - "- Validation (20% of the dataset used to tune the hyperparameters)\n", - "- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)\n", - "\n", - "We use the same seed as in the training notebook, and only the test dataset will be used in this model evaluation notebook." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "e70c1c07-0048-4bb8-a0c3-17a7895bb4b2", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "X = data.drop([\"quality\"], axis=1)\n", - "y = data.quality\n", - "\n", - "# Split out the training data\n", - "X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", - "\n", - "# Split the remaining data equally into validation and test\n", - "X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "224d0ddf-3925-4849-bc20-564175c212e8", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Load the staged model\n", - "If the training notebook succeeds, it registers a model to staging. Load the model for comparison against the current production model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "46af6002-8063-4ada-b7e5-6ce56c22f60a", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import mlflow\n", - "import mlflow.pyfunc\n", - "from sklearn.metrics import roc_auc_score\n", - "from mlflow.tracking import MlflowClient\n", - "\n", - "# to create your own version of the model, uncomment the next line, and comment the line after\n", - "# model_name = f\"wine_quality-{run_name}\"\n", - "model_name = \"wine_quality\"\n", - "staged_model = mlflow.pyfunc.load_model(f\"models:/{model_name}/staging\")\n", - "\n", - "staged_model_auc = roc_auc_score(y_test, staged_model.predict(X_test))\n", - "print(f'Current staged model AUC on test data: {staged_model_auc}')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "1655b27e-19cc-40bd-82d0-cddbb95261e4", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Load the current production model (if any)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "d91af6d7-b433-4878-aa70-1de36779ee48", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "try: \n", - " production_model = mlflow.pyfunc.load_model(f\"models:/{model_name}/production\")\n", - "except:\n", - " production_model = None\n", - " print(\"No current model in production\") " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "8d1df4d8-4b50-47d8-ad89-d3c80ce960ee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Compare staged model to production model (if exists), keep better model in production" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": 
"9f7b7e14-b016-41c1-9852-aae0f6625e93", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "client = MlflowClient()\n", - "\n", - "def get_stage_version(model_name, stage_name):\n", - " stages = client.get_latest_versions(model_name)\n", - " version = [i.version for i in stages if i.current_stage == stage_name]\n", - " return version[0] if version else '0'\n", - "\n", - "prod_version = get_stage_version(model_name, \"Production\")\n", - "staging_version = get_stage_version(model_name, \"Staging\")\n", - "\n", - "if production_model:\n", - " prod_model_auc = roc_auc_score(y_test, production_model.predict(X_test))\n", - " print(f'Current production model AUC on test data: {prod_model_auc}')\n", - "\n", - " if staged_model_auc > prod_model_auc:\n", - " print(\"Staged model outperforms current production model.\")\n", - " print(\"Archiving old production model\")\n", - " client.transition_model_version_stage(\n", - " name=model_name,\n", - " version=prod_version,\n", - " stage=\"Archived\",\n", - " )\n", - " print(\"Promoting staging to production\")\n", - " client.transition_model_version_stage(\n", - " name=model_name,\n", - " version=staging_version,\n", - " stage=\"Production\",\n", - " )\n", - " \n", - " else:\n", - " raise Exception(\"Staged model does not outperform current prod, exiting\")\n", - " \n", - "else:\n", - " print(\"No production model found, promoting staging to production\")\n", - " client.transition_model_version_stage(\n", - " name=model_name,\n", - " version=staging_version,\n", - " stage=\"Production\",\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f284978e-9c9b-4cd2-8bdc-1393991c5701", - "showTitle": false, - "title": "" - } - }, - "source": [ - "The Models page now shows the best-performing model version in stage \"Production\".\n", - "\n", - "You can now refer to the model using the path \"models:/wine_quality/production\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "58b22211-62ea-4fdd-a6ff-c552a65fa6e9", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "model = mlflow.pyfunc.load_model(f\"models:/{model_name}/production\")\n", - "\n", - "# Sanity-check: This should match the AUC logged by MLflow\n", - "print(f'AUC: {roc_auc_score(y_test, model.predict(X_test))}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "f230ed54-368c-4acc-ac12-b18f45808af6", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "part_1_3_evaluating", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f775ddb6-7daf-4f9f-99fa-7fc024351b47", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Part 1: Evaluating\n", + "\n", + "The Part 1 Data Prep notebook handles data preparation and quality checking steps. \n", + "\n", + "The Part 1 Model Training notebook builds a model and writes metrics to MLflow. \n", + "\n", + "This notebook will handle the following steps:\n", + "- Load the test data.\n", + "- Load the model registered to staging in the training step.\n", + "- Use the trained model to predict on the test data and generate model evaluation metrics.\n", + "- If no prior trained model exists, the model will be registered as a baseline model in production.\n", + "- If a production model is found, the evaluation metrics for that model will be compared against the newly trained model and if they surpass production, model will be registered to production.\n", + "\n", + "\n", + "## Requirements\n", + "This tutorial requires Databricks Runtime for Machine Learning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6c08ac56-142f-4c14-84ea-59d56e1c95db", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Multiple people may be running this workshop at the same time. We want each\n", + "# participant to have their own set of files. To create your own file storage area,\n", + "# put your name below:\n", + "\n", + "your_name = \"\"\n", + "\n", + "try: run_name = dbutils.widgets.get(\"run_name\")\n", + "except: run_name = your_name.strip()\n", + "run_name = \"no_name\" if run_name == \"\" else run_name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c79dcf84-c3b4-43d7-a626-86fa258520ef", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# We need to know if this is running as part of a Continuous Integration or as part of a\n", + "# Continuous Deployment. 
Let's look for a flag that will tell us.\n", + "devops_action = \"\"\n", + "try: devops_action = dbutils.widgets.get(\"devops_action\")\n", + "except: devops_action = \"unknown\"\n", + "devops_action = devops_action.strip().upper()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "953f4f1f-405b-44e1-863b-600e6f2bc94f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Load the prepared data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4bf02fb6-5f66-4a73-a166-d51137f7509c", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "if devops_action == \"INTEGRATION\" or devops_action == \"UNKNOWN\":\n", + " data = pd.read_csv(f\"/dbfs/tutorials/wine-data/{run_name}/wine-quality-all-prepped.csv\")\n", + " data = data.drop([\"Unnamed: 0\"], axis=1)\n", + "elif devops_action == \"DEPLOYMENT\":\n", + " data = spark.read.format(\"delta\").load(\"dbfs:/tutorials/wine-data/delta\")\n", + " data = data.toPandas()\n", + "\n", + "# add in the corresponding parameter to the cd pipeline\n", + "# and parameter handling in this notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "379abb9a-bd5b-46b9-8975-ce91858eba39", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Split dataset and use test dataset to measure trained model\n", + "Split the input data into 3 sets:\n", + "- Train (60% of the dataset used to train the model)\n", + "- Validation (20% of the dataset used to tune the hyperparameters)\n", + "- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)\n", + "\n", + "We use the same seed as in the training notebook, and only the test dataset will be used in this model evaluation notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e70c1c07-0048-4bb8-a0c3-17a7895bb4b2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = data.drop([\"quality\"], axis=1)\n", + "y = data.quality\n", + "\n", + "# Split out the training data\n", + "X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", + "\n", + "# Split the remaining data equally into validation and test\n", + "X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "224d0ddf-3925-4849-bc20-564175c212e8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Load the staged model\n", + "If the training notebook succeeds, it registers a model to staging. Load the model for comparison against the current production model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "46af6002-8063-4ada-b7e5-6ce56c22f60a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import mlflow\n", + "import mlflow.pyfunc\n", + "from sklearn.metrics import roc_auc_score\n", + "from mlflow.tracking import MlflowClient\n", + "\n", + "# to create your own version of the model, uncomment the next line, and comment the line after\n", + "# model_name = f\"wine_quality-{run_name}\"\n", + "model_name = \"wine_quality\"\n", + "staged_model = mlflow.pyfunc.load_model(f\"models:/{model_name}/staging\")\n", + "\n", + "staged_model_auc = roc_auc_score(y_test, staged_model.predict(X_test))\n", + "print(f'Current staged model AUC on test data: {staged_model_auc}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1655b27e-19cc-40bd-82d0-cddbb95261e4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Load the current production model (if any)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d91af6d7-b433-4878-aa70-1de36779ee48", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "try: \n", + " production_model = mlflow.pyfunc.load_model(f\"models:/{model_name}/production\")\n", + "except:\n", + " production_model = None\n", + " print(\"No current model in production\") " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8d1df4d8-4b50-47d8-ad89-d3c80ce960ee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Compare staged model to production model (if exists), keep better model in production" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9f7b7e14-b016-41c1-9852-aae0f6625e93", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client = MlflowClient()\n", + "\n", + "def get_stage_version(model_name, stage_name):\n", + " stages = client.get_latest_versions(model_name)\n", + " version = [i.version for i in stages if i.current_stage == stage_name]\n", + " return version[0] if version else '0'\n", + "\n", + "prod_version = get_stage_version(model_name, \"Production\")\n", + "staging_version = get_stage_version(model_name, \"Staging\")\n", + "\n", + "if production_model:\n", + " prod_model_auc = roc_auc_score(y_test, production_model.predict(X_test))\n", + " print(f'Current production model AUC on test data: {prod_model_auc}')\n", + "\n", + " if staged_model_auc > prod_model_auc:\n", + " print(\"Staged model outperforms current production model.\")\n", + " print(\"Archiving old production model\")\n", + " client.transition_model_version_stage(\n", + " name=model_name,\n", + " version=prod_version,\n", + " stage=\"Archived\",\n", + " )\n", + " print(\"Promoting staging to production\")\n", + " client.transition_model_version_stage(\n", + " name=model_name,\n", + " version=staging_version,\n", + " 
stage=\"Production\",\n", + " )\n", + " \n", + " else:\n", + " raise Exception(\"Staged model does not outperform current prod, exiting\")\n", + " \n", + "else:\n", + " print(\"No production model found, promoting staging to production\")\n", + " client.transition_model_version_stage(\n", + " name=model_name,\n", + " version=staging_version,\n", + " stage=\"Production\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f284978e-9c9b-4cd2-8bdc-1393991c5701", + "showTitle": false, + "title": "" + } + }, + "source": [ + "The Models page now shows the best-performing model version in stage \"Production\".\n", + "\n", + "You can now refer to the model using the path \"models:/wine_quality/production\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "58b22211-62ea-4fdd-a6ff-c552a65fa6e9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "model = mlflow.pyfunc.load_model(f\"models:/{model_name}/production\")\n", + "\n", + "# Sanity-check: This should match the AUC logged by MLflow\n", + "print(f'AUC: {roc_auc_score(y_test, model.predict(X_test))}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f230ed54-368c-4acc-ac12-b18f45808af6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "part_1_3_evaluating", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/MLOps-ADO-ADB/src/workshop/notebooks/part_1_4_scoring.ipynb b/MLOps-ADO-ADB/src/workshop/notebooks/part_1_4_scoring.ipynb index 5dc4b4fa..842bfeaf 100644 --- a/MLOps-ADO-ADB/src/workshop/notebooks/part_1_4_scoring.ipynb +++ b/MLOps-ADO-ADB/src/workshop/notebooks/part_1_4_scoring.ipynb @@ -1,176 +1,176 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "42da3deb-616c-47da-ba93-a9259704ce36", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Batch inference\n", - "\n", - "There are many scenarios where you might want to evaluate a model on a corpus of new data. For example, you may have a fresh batch of data, or may need to compare the performance of two models on the same corpus of data.\n", - "\n", - "The following code evaluates the model on data stored in a Delta table, using Spark to run the computation in parallel." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "44a3d2ba-1285-460e-be14-a060c06de364", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# To simulate a new corpus of data, save the existing X_train data to a Delta table. 
\n", - "# In the real world, this would be a new batch of data.\n", - "spark_df = spark.createDataFrame(X_train)\n", - "table_path = \"dbfs:/tutorials/wine-data/delta\"\n", - "\n", - "# Delete the contents of this path in case this cell has already been run\n", - "dbutils.fs.rm(table_path, True)\n", - "spark_df.write.format(\"delta\").save(table_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "a033adeb-5af8-403e-8c68-75dd829caedf", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Load the model into a Spark UDF, so it can be applied to the Delta table." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "d5a7251e-ef02-4eb3-8c51-52ed6faa8abf", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import mlflow.pyfunc\n", - "\n", - "apply_model_udf = mlflow.pyfunc.spark_udf(spark, f\"models:/{model_name}/production\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "7beb5d22-6dcd-4a6c-80fb-6ca88808ccfb", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Read the \"new data\" from Delta\n", - "new_data = spark.read.format(\"delta\").load(table_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "39e22812-2516-48e6-9735-21bff0fcbf29", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "display(new_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "8087a6e1-511d-4c5b-895c-34539ed004a2", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from pyspark.sql.functions import struct\n", - "\n", - "# Apply the model to the new data\n", - "udf_inputs = struct(*(X_train.columns.tolist()))\n", - "\n", - "new_data = new_data.withColumn(\n", - " \"prediction\",\n", - " apply_model_udf(udf_inputs)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "c451842f-0280-4900-9e6f-4bd425e8c139", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Each row now has an associated prediction. 
Note that the xgboost function does not output probabilities by default, so the predictions are not limited to the range [0, 1].\n", - "display(new_data)" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "part_1_4_scoring", - "widgets": {} - }, - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 0 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "42da3deb-616c-47da-ba93-a9259704ce36", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Batch inference\n", + "\n", + "There are many scenarios where you might want to evaluate a model on a corpus of new data. For example, you may have a fresh batch of data, or may need to compare the performance of two models on the same corpus of data.\n", + "\n", + "The following code evaluates the model on data stored in a Delta table, using Spark to run the computation in parallel." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "44a3d2ba-1285-460e-be14-a060c06de364", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# To simulate a new corpus of data, save the existing X_train data to a Delta table. \n", + "# In the real world, this would be a new batch of data.\n", + "spark_df = spark.createDataFrame(X_train)\n", + "table_path = \"dbfs:/tutorials/wine-data/delta\"\n", + "\n", + "# Delete the contents of this path in case this cell has already been run\n", + "dbutils.fs.rm(table_path, True)\n", + "spark_df.write.format(\"delta\").save(table_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a033adeb-5af8-403e-8c68-75dd829caedf", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Load the model into a Spark UDF, so it can be applied to the Delta table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d5a7251e-ef02-4eb3-8c51-52ed6faa8abf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import mlflow.pyfunc\n", + "\n", + "apply_model_udf = mlflow.pyfunc.spark_udf(spark, f\"models:/{model_name}/production\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7beb5d22-6dcd-4a6c-80fb-6ca88808ccfb", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Read the \"new data\" from Delta\n", + "new_data = spark.read.format(\"delta\").load(table_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "39e22812-2516-48e6-9735-21bff0fcbf29", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "display(new_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8087a6e1-511d-4c5b-895c-34539ed004a2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import struct\n", + "\n", + "# Apply the model to the new data\n", + "udf_inputs = struct(*(X_train.columns.tolist()))\n", + "\n", + "new_data = new_data.withColumn(\n", + " \"prediction\",\n", + " apply_model_udf(udf_inputs)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c451842f-0280-4900-9e6f-4bd425e8c139", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Each row now has an associated prediction. Note that the xgboost function does not output probabilities by default, so the predictions are not limited to the range [0, 1].\n", + "display(new_data)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "part_1_4_scoring", + "widgets": {} + }, + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/MLOps-ADO-ADB/src/workshop/notebooks/part_4_new_training_code.ipynb b/MLOps-ADO-ADB/src/workshop/notebooks/part_4_new_training_code.ipynb index 3f78c232..21d016af 100644 --- a/MLOps-ADO-ADB/src/workshop/notebooks/part_4_new_training_code.ipynb +++ b/MLOps-ADO-ADB/src/workshop/notebooks/part_4_new_training_code.ipynb @@ -1,219 +1,219 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "31f31ff4-9739-47a7-9610-94cc9522f8c9", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Experiment with a new model\n", - "\n", - "The random forest model performed well even without hyperparameter tuning.\n", - "\n", - "The following code uses the xgboost library to train a more accurate model. It runs a parallel hyperparameter sweep to train multiple\n", - "models in parallel, using Hyperopt and SparkTrials. As before, the code tracks the performance of each parameter configuration with MLflow." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "022fbb0a-62fe-4fb2-9eb0-ccc06af2e583", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK\n", - "from hyperopt.pyll import scope\n", - "from math import exp\n", - "import mlflow.xgboost\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "\n", - "search_space = {\n", - " 'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),\n", - " 'learning_rate': hp.loguniform('learning_rate', -3, 0),\n", - " 'reg_alpha': hp.loguniform('reg_alpha', -5, -1),\n", - " 'reg_lambda': hp.loguniform('reg_lambda', -6, -1),\n", - " 'min_child_weight': hp.loguniform('min_child_weight', -1, 3),\n", - " 'objective': 'binary:logistic',\n", - " 'seed': 123, # Set a seed for deterministic training\n", - "}\n", - "\n", - "def train_model(params):\n", - " # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.\n", - " mlflow.xgboost.autolog()\n", - " with mlflow.start_run(nested=True):\n", - " train = xgb.DMatrix(data=X_train, label=y_train)\n", - " validation = xgb.DMatrix(data=X_val, label=y_val)\n", - " # Pass in the validation set so xgb can track an evaluation metric. XGBoost terminates training when the evaluation metric\n", - " # is no longer improving.\n", - " booster = xgb.train(params=params, dtrain=train, num_boost_round=1000,\\\n", - " evals=[(validation, \"validation\")], early_stopping_rounds=50)\n", - " validation_predictions = booster.predict(validation)\n", - " auc_score = roc_auc_score(y_val, validation_predictions)\n", - " mlflow.log_metric('auc', auc_score)\n", - "\n", - " signature = infer_signature(X_train, booster.predict(train))\n", - " mlflow.xgboost.log_model(booster, \"model\", signature=signature)\n", - " \n", - " # Set the loss to -1*auc_score so fmin maximizes the auc_score\n", - " return {'status': STATUS_OK, 'loss': -1*auc_score, 'booster': booster.attributes()}\n", - "\n", - "# Greater parallelism will lead to speedups, but a less optimal hyperparameter sweep. \n", - "# A reasonable value for parallelism is the square root of max_evals.\n", - "spark_trials = SparkTrials(parallelism=10)\n", - "\n", - "# Run fmin within an MLflow run context so that each hyperparameter configuration is logged as a child run of a parent\n", - "# run called \"xgboost_models\" .\n", - "with mlflow.start_run(run_name='xgboost_models'):\n", - " best_params = fmin(\n", - " fn=train_model, \n", - " space=search_space, \n", - " algo=tpe.suggest, \n", - " max_evals=96,\n", - " trials=spark_trials,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "2991436a-2591-4436-89b2-65420c52f0e0", - "showTitle": false, - "title": "" - } - }, - "source": [ - "#### Use MLflow to view the results\n", - "Open up the Experiment Runs sidebar to see the MLflow runs. Click on Date next to the down arrow to display a menu, and select 'auc' to display the runs sorted by the auc metric. The highest auc value is 0.90.\n", - "\n", - "MLflow tracks the parameters and performance metrics of each run. Click the External Link icon at the top of the Experiment Runs sidebar to navigate to the MLflow Runs Table." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "325e2c3c-4164-4f73-9022-0b954a7ce4b7", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Now investigate how the hyperparameter choice correlates with AUC. Click the \"+\" icon to expand the parent run, then select all runs except the parent, and click \"Compare\". Select the Parallel Coordinates Plot.\n", - "\n", - "The Parallel Coordinates Plot is useful in understanding the impact of parameters on a metric. You can drag the pink slider bar at the upper right corner of the plot to highlight a subset of AUC values and the corresponding parameter values. The plot below highlights the highest AUC values:\n", - "\n", - "\n", - "\n", - "Notice that all of the top performing runs have a low value for reg_lambda and learning_rate. \n", - "\n", - "You could run another hyperparameter sweep to explore even lower values for these parameters. For simplicity, that step is not included in this example." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "773d95c7-4df6-47e3-b897-26e01a0cf1a6", - "showTitle": false, - "title": "" - } - }, - "source": [ - "You used MLflow to log the model produced by each hyperparameter configuration. The following code finds the best performing run and saves the model to Model Registry." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "ead8b6cb-bd7e-447a-8271-c283b0dcba63", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "best_run = mlflow.search_runs(order_by=['metrics.auc DESC']).iloc[0]\n", - "print(f'AUC of Best Run: {best_run[\"metrics.auc\"]}')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "4dbc4e92-20de-44f5-96b8-506c413d0268", - "showTitle": false, - "title": "" - } - }, - "source": [ - "#### Update the production `wine_quality` model in MLflow Model Registry\n", - "\n", - "Earlier, you saved the baseline model to Model Registry with the name `wine_quality`. Now that you have a created a more accurate model, update `wine_quality`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "d0b67f77-31cd-405e-9d28-f3519b517b3f", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "new_model_version = mlflow.register_model(f\"runs:/{best_run.run_id}/model\", model_name)\n", - "\n", - "# Registering the model takes a few seconds, so add a small delay\n", - "time.sleep(15)" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "part_4_new_training_code", - "widgets": {} - }, - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 0 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "31f31ff4-9739-47a7-9610-94cc9522f8c9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Experiment with a new model\n", + "\n", + "The random forest model performed well even without hyperparameter tuning.\n", + "\n", + "The following code uses the xgboost library to train a more accurate model. It runs a parallel hyperparameter sweep to train multiple\n", + "models in parallel, using Hyperopt and SparkTrials. As before, the code tracks the performance of each parameter configuration with MLflow." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "022fbb0a-62fe-4fb2-9eb0-ccc06af2e583", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK\n", + "from hyperopt.pyll import scope\n", + "from math import exp\n", + "import mlflow.xgboost\n", + "import numpy as np\n", + "import xgboost as xgb\n", + "\n", + "search_space = {\n", + " 'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),\n", + " 'learning_rate': hp.loguniform('learning_rate', -3, 0),\n", + " 'reg_alpha': hp.loguniform('reg_alpha', -5, -1),\n", + " 'reg_lambda': hp.loguniform('reg_lambda', -6, -1),\n", + " 'min_child_weight': hp.loguniform('min_child_weight', -1, 3),\n", + " 'objective': 'binary:logistic',\n", + " 'seed': 123, # Set a seed for deterministic training\n", + "}\n", + "\n", + "def train_model(params):\n", + " # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.\n", + " mlflow.xgboost.autolog()\n", + " with mlflow.start_run(nested=True):\n", + " train = xgb.DMatrix(data=X_train, label=y_train)\n", + " validation = xgb.DMatrix(data=X_val, label=y_val)\n", + " # Pass in the validation set so xgb can track an evaluation metric. 
XGBoost terminates training when the evaluation metric\n", + " # is no longer improving.\n", + " booster = xgb.train(params=params, dtrain=train, num_boost_round=1000,\\\n", + " evals=[(validation, \"validation\")], early_stopping_rounds=50)\n", + " validation_predictions = booster.predict(validation)\n", + " auc_score = roc_auc_score(y_val, validation_predictions)\n", + " mlflow.log_metric('auc', auc_score)\n", + "\n", + " signature = infer_signature(X_train, booster.predict(train))\n", + " mlflow.xgboost.log_model(booster, \"model\", signature=signature)\n", + " \n", + " # Set the loss to -1*auc_score so fmin maximizes the auc_score\n", + " return {'status': STATUS_OK, 'loss': -1*auc_score, 'booster': booster.attributes()}\n", + "\n", + "# Greater parallelism will lead to speedups, but a less optimal hyperparameter sweep. \n", + "# A reasonable value for parallelism is the square root of max_evals.\n", + "spark_trials = SparkTrials(parallelism=10)\n", + "\n", + "# Run fmin within an MLflow run context so that each hyperparameter configuration is logged as a child run of a parent\n", + "# run called \"xgboost_models\" .\n", + "with mlflow.start_run(run_name='xgboost_models'):\n", + " best_params = fmin(\n", + " fn=train_model, \n", + " space=search_space, \n", + " algo=tpe.suggest, \n", + " max_evals=96,\n", + " trials=spark_trials,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2991436a-2591-4436-89b2-65420c52f0e0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Use MLflow to view the results\n", + "Open up the Experiment Runs sidebar to see the MLflow runs. Click on Date next to the down arrow to display a menu, and select 'auc' to display the runs sorted by the auc metric. The highest auc value is 0.90.\n", + "\n", + "MLflow tracks the parameters and performance metrics of each run. Click the External Link icon at the top of the Experiment Runs sidebar to navigate to the MLflow Runs Table." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "325e2c3c-4164-4f73-9022-0b954a7ce4b7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Now investigate how the hyperparameter choice correlates with AUC. Click the \"+\" icon to expand the parent run, then select all runs except the parent, and click \"Compare\". Select the Parallel Coordinates Plot.\n", + "\n", + "The Parallel Coordinates Plot is useful in understanding the impact of parameters on a metric. You can drag the pink slider bar at the upper right corner of the plot to highlight a subset of AUC values and the corresponding parameter values. The plot below highlights the highest AUC values:\n", + "\n", + "\n", + "\n", + "Notice that all of the top performing runs have a low value for reg_lambda and learning_rate. \n", + "\n", + "You could run another hyperparameter sweep to explore even lower values for these parameters. For simplicity, that step is not included in this example." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "773d95c7-4df6-47e3-b897-26e01a0cf1a6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "You used MLflow to log the model produced by each hyperparameter configuration. 
The following code finds the best performing run and saves the model to Model Registry." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ead8b6cb-bd7e-447a-8271-c283b0dcba63", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "best_run = mlflow.search_runs(order_by=['metrics.auc DESC']).iloc[0]\n", + "print(f'AUC of Best Run: {best_run[\"metrics.auc\"]}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4dbc4e92-20de-44f5-96b8-506c413d0268", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Update the production `wine_quality` model in MLflow Model Registry\n", + "\n", + "Earlier, you saved the baseline model to Model Registry with the name `wine_quality`. Now that you have a created a more accurate model, update `wine_quality`." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d0b67f77-31cd-405e-9d28-f3519b517b3f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "new_model_version = mlflow.register_model(f\"runs:/{best_run.run_id}/model\", model_name)\n", + "\n", + "# Registering the model takes a few seconds, so add a small delay\n", + "time.sleep(15)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "part_4_new_training_code", + "widgets": {} + }, + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/README.md b/README.md index 5684133e..1b1c98b7 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,31 @@ -# MLOps Template -MLOps Template is a repo created by Microsoft field personnel (CSA, GBB, MTC) that provides several tools and templates to facilitate modern MLOps practices. - -In addition to a template for an Active Learning implementation, there are also two sets of materials to facilitate an introductory workshop on modern MLOps practices, one, developed by the West Region CSU, using Azure Machine Learning and GitHub Actions, and another featuring Azure Databricks for ML asset development and Azure DevOps for CI/CD pipelines. - -- [Active Learning template](/src/active_learning_cv/) -- [MLOps workshop materials using Azure Machine Learning and GitHub Actions](/src/workshop/) -- [MLOps workshop materials using Azure Databricks and Azure DevOps](MLOps-ADO-ADB/src/workshop/) - - -## Contributing - -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. - -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
-For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or -contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. - -## Trademarks - -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow -[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). -Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. -Any use of third-party trademarks or logos are subject to those third-party's policies. +# MLOps Template +MLOps Template is a repo created by Microsoft field personnel (CSA, GBB, MTC) that provides several tools and templates to facilitate modern MLOps practices. + +In addition to a template for an Active Learning implementation, there are also two sets of materials to facilitate an introductory workshop on modern MLOps practices, one, developed by the West Region CSU, using Azure Machine Learning and GitHub Actions, and another featuring Azure Databricks for ML asset development and Azure DevOps for CI/CD pipelines. + +- [Active Learning template](/src/active_learning_cv/) +- [MLOps workshop materials using Azure Machine Learning and GitHub Actions](/src/workshop/) +- [MLOps workshop materials using Azure Databricks and Azure DevOps](MLOps-ADO-ADB/src/workshop/) + + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. 
diff --git a/SECURITY.md b/SECURITY.md index f7b89984..12fbd833 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,41 +1,41 @@ - - -## Security - -Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). - -If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. - -## Reporting Security Issues - -**Please do not report security vulnerabilities through public GitHub issues.** - -Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). - -If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). - -You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). - -Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: - - * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) - * Full paths of source file(s) related to the manifestation of the issue - * The location of the affected source code (tag/branch/commit or direct URL) - * Any special configuration required to reproduce the issue - * Step-by-step instructions to reproduce the issue - * Proof-of-concept or exploit code (if possible) - * Impact of the issue, including how an attacker might exploit the issue - -This information will help us triage your report more quickly. - -If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. - -## Preferred Languages - -We prefer all communications to be in English. - -## Policy - -Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). - + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 
+ +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). + \ No newline at end of file diff --git a/src/.amlignore b/src/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/.amlignore.amltmp b/src/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/active_learning_cv/.amlignore b/src/active_learning_cv/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/active_learning_cv/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. 
Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/active_learning_cv/.amlignore.amltmp b/src/active_learning_cv/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/active_learning_cv/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/active_learning_cv/README.md b/src/active_learning_cv/README.md index 9ccbb2bd..fe58abd6 100644 --- a/src/active_learning_cv/README.md +++ b/src/active_learning_cv/README.md @@ -1,2 +1,2 @@ - -# [Please access the repo of Active Learning Here](https://github.com/microsoft/MLOpsTemplate/tree/james-simdev/src/active_learning_cv) + +# [Please access the repo of Active Learning Here](https://github.com/microsoft/MLOpsTemplate/tree/james-simdev/src/active_learning_cv) diff --git a/src/workshop/.amlignore b/src/workshop/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/.amlignore.amltmp b/src/workshop/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/README.md b/src/workshop/README.md index fcdfc575..351bf569 100644 --- a/src/workshop/README.md +++ b/src/workshop/README.md @@ -1,114 +1,114 @@ -# MLOps Workshop - -## Introduction -The MLOps workshop is an instructor-led workshop that provides guidance on an MLOps -implementation in Azure. This workshop leverages [Azure Machine -Learning](https://azure.microsoft.com/en-us/services/machine-learning/?msclkid=99faf4b9b43f11ec8a3dc121747bf2a7) -and [Github -Actions](https://docs.microsoft.com/en-us/azure/developer/github/github-actions?msclkid=a9587556b43f11ecb200fd14b82d03f0) -to implement a robust set of workflows to support machine learning models in production. - -The core business problem revolves around predicting taxi fares in New York. This is based on an [Azure Open -Dataset](https://azure.microsoft.com/en-us/services/open-datasets/#overview) sourced from -[here](https://docs.microsoft.com/en-us/azure/open-datasets/dataset-taxi-green?tabs=azureml-opendatasets). The -need to predict numerical values is a regression problem that is a common need for many enterprises across -data sets in their organizations. 
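As a point of reference for the dataset just described, the green taxi data can be pulled directly with the `azureml-opendatasets` package listed in `conda-local.yml`. The sketch below is illustrative only; the date range and sampling fraction are assumptions, not the query used by the workshop notebook.

```python
# Minimal sketch (not part of the workshop code): pull one month of the NYC green
# taxi open dataset referenced above. The date range and sampling fraction are
# illustrative assumptions.
from datetime import datetime

from azureml.opendatasets import NycTlcGreen

start_date = datetime(2018, 1, 1)
end_date = datetime(2018, 1, 31)

# Download the raw trips and keep a small random sample for local exploration.
green_taxi_df = NycTlcGreen(start_date=start_date, end_date=end_date).to_pandas_dataframe()
sample_df = green_taxi_df.sample(frac=0.01, random_state=42)

# totalAmount is the value the workshop's regression model predicts.
print(sample_df.shape)
print(sample_df["totalAmount"].describe())
```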
For the purpose of this workshop, the key stages of exploring the data, -engineering predictive features (data engineering) and model building (training, hyperparameter tuning, -algorithm selection, etc.) will be assumed to be done and already codified in this [Jupyter -notebook](https://github.com/microsoft/MLOpsTemplate/blob/thomassantosh-dev/src/workshop/notebooks/taxi-tutorial.ipynb). -The core focus of the workshop will then be how to productionalize this code, lay the DevOps foundation, and -support the best model in production. - -## Audience -- Customer data scientists -- ML engineers -- ML platform architects and managers -- ... and any other roles that require hands-on experience to support ML models in Azure - -## Goals -- Understand key elements of modern MLOps and how it helps improve and accelerate ML practices. -- Design experiments, deployment environments and MLOps pipelines in Azure Machine Learning. -- Get hands-on experience in building continuous integration and continuous deployment pipelines with new Azure ML vNext and Github Actions. - -## Structure -- [Pre-Workshop Checklist](documents/part_tips.md) -- [Part 0: MLOps overview and environment setup](documents/part_0.md) -- [Part 1: Structure code for fast iterative development](documents/part_1.md) -- [Part 2: Use cloud scale compute and monitor experiment with Azure ML](documents/part_2.md) -- [Part 3: Use github for version control and automation](documents/part_3.md) -- [Part 4: Continuous integration (CI)](documents/part_4.md) -- [Part 5: Continuous deployment (CD)](documents/part_5.md) -- Part 6: Observability  - -## Repo Structure -> Note: This is the repository file structure from the repository root. -- `README.md` > Core README for the repository -- `.github` - - `actions` > YAML files for Github Actions relating to AML job creation, and endpoint deployment - - `workflows` > YAML files for Github Actions relating to unit tests and CI/CD workflows -- `src` - - `workshop` - - `README.md` > README file highlighting the workshop goals, steps and key audience - - `conda-local.yml` > Third-party python dependencies for managing the conda virtual environment - - `core` - - ``data_engineering`` > Python and YAML files to support feature engineering - - ``evaluating`` > Python and YAML files to support model evaluation based on specific model metrics - - ``pipelines`` > YAML files to support creation of ML pipelines - - ``scoring`` > Python and YAML files to support model deployment and scoring - - ``training`` > Python and YAML files to support model training - - ``data`` > Base datasets in parquet, with a Python file to load the data into the default datastore - - ``documents`` > Setup scripts, and markdown files to support a hands-on workshop - - ``infra`` > Setup scripts to support initial creation of the Azure Machine Learning infrastructure and resources - - ``notebooks`` > Jupyter notebook containing all the code related to data exploration, cleansing, feature engineering and model - creation - -## Workshop Scenario -> **Note: Read before starting the workshop!** - -Your team has been working on a new machine learning (ML) problem (predicting taxi fares in New York). The -team has been performing exploratory work on the data and has come to a state where the model is solidified. -Now, it is time to put a structure into the work so that the team can iterate faster toward building a fully -functional solution. So far, team members have been working mostly with Jupyter notebooks on their -personal compute. 
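Before walking through the re-engineering steps that follow, it helps to picture the target structure: each notebook stage becomes a small module with a command-line interface so it can be re-run with different inputs. The sketch below is hypothetical; the file name `train.py`, the argument names, and the model choice are illustrative, although the `final_df.parquet` input and the `totalAmount` label mirror what the workshop's data engineering step produces.

```python
# Hypothetical sketch of a parameterized training module (train.py). The file
# name, argument names, and model choice are illustrative, not the workshop code.
import argparse
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--prep_data", type=str, default="data", help="Folder holding prepared data")
    parser.add_argument("--model_folder", type=str, default="data", help="Folder to write the trained model to")
    parser.add_argument("--n_estimators", type=int, default=100)
    return parser.parse_args()


def main(args):
    df = pd.read_parquet(os.path.join(args.prep_data, "final_df.parquet"))
    y = df["totalAmount"]
    # Keep only numeric features for this simplified sketch.
    X = df.drop(columns=["totalAmount"]).select_dtypes("number")

    model = RandomForestRegressor(n_estimators=args.n_estimators, random_state=42)
    model.fit(X, y)

    os.makedirs(args.model_folder, exist_ok=True)
    joblib.dump(model, os.path.join(args.model_folder, "model.joblib"))


if __name__ == "__main__":
    main(parse_args())
```

A module shaped like this can then be re-run with different settings, for example `python train.py --prep_data data --model_folder models --n_estimators 200`, which is what makes it usable both locally and from a cloud job.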
- -To re-engineer this into a functional MLOps process, the following steps will be taken: -1. The code will be modularized (refactored into separate python modules) and parameterized (configured so it - can be re-run with different values). This will lay the foundation for good software practices and allow - multiple data scientists/engineers to work collaboratively on the code. (Later, we will reinforce DevOps - practices around continuous integration and continuous deployment with specific workflows to support model - training and evaluation. MLOps builds off a strong foundation in DevOps and looks to additionally manage - the model and data lifecycles to support the best model in production.) -2. After successfully restructuring the Jupyter notebook and running the modules locally, your team will - leverage Microsoft Azure to run the ML experiment at scale. They will take advantage of experiment tracking - and model management capabilities in Azure ML to keep track of experiments. The team will then deploy the - model as a rest endpoint for real time inferencing. -4. They will then setup a centralized version control in Github to keep track of project code and manage different - feature development tracks and releases. They will need to understand how to automate and orchestrate - common tasks such as environment setup, training, and testing. -5. After setting up GitHub for MLOps, your team will start automating the model training and evaluation - process with a Continuous Integration (CI) pipeline. -6. After a successful run of the CI pipeline, your team will complete the process with a Continuous - Delivery (CD) pipeline that will handle the deployment of the model without introducing any downtime in - production (hot swap). -7. Now, head to [Workshop Environment Setup: Part 0](https://github.com/microsoft/MLOpsTemplate/blob/main/src/workshop/documents/part_0.md#part-0-workshop-environment-setup) - - -## Contributing -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. - -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). -For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or -contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. - -## Trademarks -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow -[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). -Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. -Any use of third-party trademarks or logos are subject to those third-party's policies. 
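Step 2 of the scenario leans on Azure ML's experiment tracking. Below is a minimal sketch of logging a run and its metrics to the workspace through MLflow; it assumes the `azureml-mlflow` package from `conda-local.yml` is installed and a workspace `config.json` is available locally, and the metric values shown are placeholders rather than results from the workshop's model.

```python
# Minimal sketch of logging a run to the Azure ML workspace via MLflow
# (assumes azureml-mlflow is installed and a workspace config.json is present).
import mlflow
from azureml.core import Workspace

ws = Workspace.from_config()  # reads config.json downloaded from the Azure ML studio
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment("taxi-fare-training")

with mlflow.start_run():
    # In the real workshop code, metrics such as r2/mape/rmse are computed from
    # model predictions; fixed values are used here only to illustrate the calls.
    mlflow.log_param("algorithm", "random_forest")
    mlflow.log_metric("r2", 0.87)
    mlflow.log_metric("rmse", 2.95)
```

Once the tracking URI points at the workspace, these runs appear in Azure ML studio alongside jobs submitted remotely, which is what makes the later CI pipeline's metrics comparable across local and cloud runs.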
+# MLOps Workshop + +## Introduction +The MLOps workshop is an instructor-led workshop that provides guidance on an MLOps +implementation in Azure. This workshop leverages [Azure Machine +Learning](https://azure.microsoft.com/en-us/services/machine-learning/?msclkid=99faf4b9b43f11ec8a3dc121747bf2a7) +and [Github +Actions](https://docs.microsoft.com/en-us/azure/developer/github/github-actions?msclkid=a9587556b43f11ecb200fd14b82d03f0) +to implement a robust set of workflows to support machine learning models in production. + +The core business problem revolves around predicting taxi fares in New York. This is based on an [Azure Open +Dataset](https://azure.microsoft.com/en-us/services/open-datasets/#overview) sourced from +[here](https://docs.microsoft.com/en-us/azure/open-datasets/dataset-taxi-green?tabs=azureml-opendatasets). The +need to predict numerical values is a regression problem that is a common need for many enterprises across +data sets in their organizations. For the purpose of this workshop, the key stages of exploring the data, +engineering predictive features (data engineering) and model building (training, hyperparameter tuning, +algorithm selection, etc.) will be assumed to be done and already codified in this [Jupyter +notebook](https://github.com/microsoft/MLOpsTemplate/blob/thomassantosh-dev/src/workshop/notebooks/taxi-tutorial.ipynb). +The core focus of the workshop will then be how to productionalize this code, lay the DevOps foundation, and +support the best model in production. + +## Audience +- Customer data scientists +- ML engineers +- ML platform architects and managers +- ... and any other roles that require hands-on experience to support ML models in Azure + +## Goals +- Understand key elements of modern MLOps and how it helps improve and accelerate ML practices. +- Design experiments, deployment environments and MLOps pipelines in Azure Machine Learning. +- Get hands-on experience in building continuous integration and continuous deployment pipelines with new Azure ML vNext and Github Actions. + +## Structure +- [Pre-Workshop Checklist](documents/part_tips.md) +- [Part 0: MLOps overview and environment setup](documents/part_0.md) +- [Part 1: Structure code for fast iterative development](documents/part_1.md) +- [Part 2: Use cloud scale compute and monitor experiment with Azure ML](documents/part_2.md) +- [Part 3: Use github for version control and automation](documents/part_3.md) +- [Part 4: Continuous integration (CI)](documents/part_4.md) +- [Part 5: Continuous deployment (CD)](documents/part_5.md) +- Part 6: Observability  + +## Repo Structure +> Note: This is the repository file structure from the repository root. 
+- `README.md` > Core README for the repository +- `.github` + - `actions` > YAML files for Github Actions relating to AML job creation, and endpoint deployment + - `workflows` > YAML files for Github Actions relating to unit tests and CI/CD workflows +- `src` + - `workshop` + - `README.md` > README file highlighting the workshop goals, steps and key audience + - `conda-local.yml` > Third-party python dependencies for managing the conda virtual environment + - `core` + - ``data_engineering`` > Python and YAML files to support feature engineering + - ``evaluating`` > Python and YAML files to support model evaluation based on specific model metrics + - ``pipelines`` > YAML files to support creation of ML pipelines + - ``scoring`` > Python and YAML files to support model deployment and scoring + - ``training`` > Python and YAML files to support model training + - ``data`` > Base datasets in parquet, with a Python file to load the data into the default datastore + - ``documents`` > Setup scripts, and markdown files to support a hands-on workshop + - ``infra`` > Setup scripts to support initial creation of the Azure Machine Learning infrastructure and resources + - ``notebooks`` > Jupyter notebook containing all the code related to data exploration, cleansing, feature engineering and model + creation + +## Workshop Scenario +> **Note: Read before starting the workshop!** + +Your team has been working on a new machine learning (ML) problem (predicting taxi fares in New York). The +team has been performing exploratory work on the data and has come to a state where the model is solidified. +Now, it is time to put a structure into the work so that the team can iterate faster toward building a fully +functional solution. So far, team members have been working mostly with Jupyter notebooks on their +personal compute. + +To re-engineer this into a functional MLOps process, the following steps will be taken: +1. The code will be modularized (refactored into separate python modules) and parameterized (configured so it + can be re-run with different values). This will lay the foundation for good software practices and allow + multiple data scientists/engineers to work collaboratively on the code. (Later, we will reinforce DevOps + practices around continuous integration and continuous deployment with specific workflows to support model + training and evaluation. MLOps builds off a strong foundation in DevOps and looks to additionally manage + the model and data lifecycles to support the best model in production.) +2. After successfully restructuring the Jupyter notebook and running the modules locally, your team will + leverage Microsoft Azure to run the ML experiment at scale. They will take advantage of experiment tracking + and model management capabilities in Azure ML to keep track of experiments. The team will then deploy the + model as a rest endpoint for real time inferencing. +4. They will then setup a centralized version control in Github to keep track of project code and manage different + feature development tracks and releases. They will need to understand how to automate and orchestrate + common tasks such as environment setup, training, and testing. +5. After setting up GitHub for MLOps, your team will start automating the model training and evaluation + process with a Continuous Integration (CI) pipeline. +6. 
After a successful run of the CI pipeline, your team will complete the process with a Continuous + Delivery (CD) pipeline that will handle the deployment of the model without introducing any downtime in + production (hot swap). +7. Now, head to [Workshop Environment Setup: Part 0](https://github.com/microsoft/MLOpsTemplate/blob/main/src/workshop/documents/part_0.md#part-0-workshop-environment-setup) + + +## Contributing +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/src/workshop/conda-local.yml b/src/workshop/conda-local.yml index 2a7f4dc2..264c0f19 100644 --- a/src/workshop/conda-local.yml +++ b/src/workshop/conda-local.yml @@ -1,12 +1,12 @@ -name: mlops-workshop-local -channels: - - conda-forge -dependencies: - - python=3.8 - - pip=21.3.1 - - pip: - - azureml-sdk==1.38.0 - - azureml-mlflow==1.38.0 - - azureml-opendatasets==1.38.0 - - pandas==1.3.5 - - scikit-learn==1.0.2 \ No newline at end of file +name: mlops-workshop-local +channels: + - conda-forge +dependencies: + - python=3.10 + - pip + - pip: + - azureml-sdk + - azureml-mlflow + - azureml-opendatasets + - pandas + - scikit-learn \ No newline at end of file diff --git a/src/workshop/core/data_engineering/.amlignore b/src/workshop/core/data_engineering/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/data_engineering/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/data_engineering/.amlignore.amltmp b/src/workshop/core/data_engineering/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/data_engineering/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. 
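The zero-downtime "hot swap" in the continuous delivery step above comes down to shifting endpoint traffic from the current deployment to the newly validated one. The sketch below uses the Azure ML Python SDK v2 (`azure-ai-ml`, which is not part of `conda-local.yml`); the endpoint name and workspace identifiers are placeholders, the deployment names are assumed to be "blue" and "green", and a CI/CD workflow could equally drive the same swap through the Azure CLI.

```python
# Hypothetical sketch of the blue/green traffic swap behind the "hot swap" CD step.
# Endpoint name and workspace identifiers are placeholders; deployment names are
# assumed to be "blue" and "green".
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="<resource-group>",
    workspace_name="<workspace-name>",
)

endpoint = ml_client.online_endpoints.get(name="<endpoint-name>")

# Route all traffic to the newly validated deployment and drain the old one.
endpoint.traffic = {"blue": 0, "green": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
```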
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/data_engineering/conda_feature_engineering.yml b/src/workshop/core/data_engineering/conda_feature_engineering.yml index d56bf7cc..f1ede346 100644 --- a/src/workshop/core/data_engineering/conda_feature_engineering.yml +++ b/src/workshop/core/data_engineering/conda_feature_engineering.yml @@ -1,11 +1,11 @@ -name: data-engineering -channels: - - conda-forge -dependencies: - - python=3.8 - - pip=21.3.1 - - pip: - - azureml-mlflow==1.38.0 - - azureml-opendatasets==1.38.0 - - pandas==1.3.5 +name: data-engineering +channels: + - conda-forge +dependencies: + - python=3.8 + - pip=21.3.1 + - pip: + - azureml-mlflow==1.38.0 + - azureml-opendatasets==1.38.0 + - pandas==1.3.5 - scikit-learn==1.0.2 \ No newline at end of file diff --git a/src/workshop/core/data_engineering/feature_engineering.py b/src/workshop/core/data_engineering/feature_engineering.py index f920f46b..ea243918 100644 --- a/src/workshop/core/data_engineering/feature_engineering.py +++ b/src/workshop/core/data_engineering/feature_engineering.py @@ -1,110 +1,110 @@ -import pandas as pd -import numpy as np -from datetime import datetime -from dateutil.relativedelta import relativedelta -import argparse -import sys -import os -from sklearn.model_selection import train_test_split -sys.path.append(os.path.join(os.path.dirname(__file__),'../../')) -def parse_args(): - # setup arg parser - parser = argparse.ArgumentParser() - - - # add arguments - parser.add_argument("--nyc_file_name", type=str, default="green_taxi.parquet") - parser.add_argument("--public_holiday_file_name", type=str, default="holidays.parquet") - parser.add_argument("--weather_file_name", type=str, default="weather.parquet") - parser.add_argument("--prep_data", type=str,default="data", help="Path of prepped data") - parser.add_argument("--input_folder", type=str, default="data") - parser.add_argument("--run_mode", type=str, default="local") - - # parse args - args = parser.parse_args() - - # return args - return args - - -def build_time_features(vector): - pickup_datetime = vector[0] - month_num = pickup_datetime.month - day_of_month = pickup_datetime.day - day_of_week = pickup_datetime.weekday() - hour_of_day = pickup_datetime.hour - country_code = "US" - hr_sin = np.sin(hour_of_day*(2.*np.pi/24)) - hr_cos = np.cos(hour_of_day*(2.*np.pi/24)) - dy_sin = np.sin(day_of_week*(2.*np.pi/7)) - dy_cos = np.cos(day_of_week*(2.*np.pi/7)) - - return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos)) - -def main(args): - - # read in data - - green_taxi_df = pd.read_parquet(os.path.join(args.input_folder, args.nyc_file_name)) - - green_taxi_df[["month_num", "day_of_month","day_of_week", "hour_of_day", "country_code", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]] = \ - green_taxi_df[["lpepPickupDatetime"]].apply(build_time_features, axis=1) - - columns_to_remove = ["lpepDropoffDatetime", "puLocationId", "doLocationId", "extra", "mtaTax", - "improvementSurcharge", "tollsAmount", "ehailFee", "tripType", "rateCodeID", - "storeAndFwdFlag", "paymentType", "fareAmount", "tipAmount"] - - green_taxi_df.drop(columns_to_remove, axis=1, inplace=True) - - - green_taxi_df["datetime"] = green_taxi_df["lpepPickupDatetime"].dt.normalize() - - - holidays_df = 
pd.read_parquet(os.path.join(args.input_folder, args.public_holiday_file_name)) - - holidays_df = holidays_df.rename(columns={"countryRegionCode": "country_code"}) - holidays_df["datetime"] = holidays_df["date"].dt.normalize() - - holidays_df.drop(["countryOrRegion", "holidayName", "date"], axis=1, inplace=True) - - taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how="left", on=["datetime", "country_code"]) - taxi_holidays_df[taxi_holidays_df["normalizeHolidayName"].notnull()] - - - weather_df = pd.read_parquet(os.path.join(args.input_folder,args.weather_file_name)) - - weather_df["datetime"] = weather_df["datetime"].dt.normalize() - - # group by datetime - aggregations = {"precipTime": "max", "temperature": "mean", "precipDepth": "max"} - weather_df_grouped = weather_df.groupby("datetime").agg(aggregations) - weather_df_grouped.head(10) - - taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how="left", on=["datetime"]) - taxi_holidays_weather_df.describe() - - final_df = taxi_holidays_weather_df.query("pickupLatitude>=40.53 and pickupLatitude<=40.88 and \ - pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \ - tripDistance>0 and tripDistance<75 and \ - passengerCount>0 and passengerCount<100 and \ - totalAmount>0") - final_df, test_df = train_test_split(final_df, test_size=0.2, random_state=100) - os.makedirs(args.prep_data, exist_ok=True) - - if args.run_mode =='local': - print("Data Files were written successfully to folder:", args.prep_data) - - if args.run_mode =='remote': - print("Data Files were written successfully to AZML Default Data Store folder") - - final_df.to_parquet(os.path.join(args.prep_data, "final_df.parquet")) - test_df.to_parquet(os.path.join(args.prep_data, "test_df.parquet")) - - -# run script -if __name__ == "__main__": - # parse args - args = parse_args() - - # run main function - main(args) +import pandas as pd +import numpy as np +from datetime import datetime +from dateutil.relativedelta import relativedelta +import argparse +import sys +import os +from sklearn.model_selection import train_test_split +sys.path.append(os.path.join(os.path.dirname(__file__),'../../')) +def parse_args(): + # setup arg parser + parser = argparse.ArgumentParser() + + + # add arguments + parser.add_argument("--nyc_file_name", type=str, default="green_taxi.parquet") + parser.add_argument("--public_holiday_file_name", type=str, default="holidays.parquet") + parser.add_argument("--weather_file_name", type=str, default="weather.parquet") + parser.add_argument("--prep_data", type=str,default="data", help="Path of prepped data") + parser.add_argument("--input_folder", type=str, default="data") + parser.add_argument("--run_mode", type=str, default="local") + + # parse args + args = parser.parse_args() + + # return args + return args + + +def build_time_features(vector): + pickup_datetime = vector[0] + month_num = pickup_datetime.month + day_of_month = pickup_datetime.day + day_of_week = pickup_datetime.weekday() + hour_of_day = pickup_datetime.hour + country_code = "US" + hr_sin = np.sin(hour_of_day*(2.*np.pi/24)) + hr_cos = np.cos(hour_of_day*(2.*np.pi/24)) + dy_sin = np.sin(day_of_week*(2.*np.pi/7)) + dy_cos = np.cos(day_of_week*(2.*np.pi/7)) + + return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos)) + +def main(args): + + # read in data + + green_taxi_df = pd.read_parquet(os.path.join(args.input_folder, args.nyc_file_name)) + + green_taxi_df[["month_num", "day_of_month","day_of_week", 
"hour_of_day", "country_code", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]] = \ + green_taxi_df[["lpepPickupDatetime"]].apply(build_time_features, axis=1) + + columns_to_remove = ["lpepDropoffDatetime", "puLocationId", "doLocationId", "extra", "mtaTax", + "improvementSurcharge", "tollsAmount", "ehailFee", "tripType", "rateCodeID", + "storeAndFwdFlag", "paymentType", "fareAmount", "tipAmount"] + + green_taxi_df.drop(columns_to_remove, axis=1, inplace=True) + + + green_taxi_df["datetime"] = green_taxi_df["lpepPickupDatetime"].dt.normalize() + + + holidays_df = pd.read_parquet(os.path.join(args.input_folder, args.public_holiday_file_name)) + + holidays_df = holidays_df.rename(columns={"countryRegionCode": "country_code"}) + holidays_df["datetime"] = holidays_df["date"].dt.normalize() + + holidays_df.drop(["countryOrRegion", "holidayName", "date"], axis=1, inplace=True) + + taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how="left", on=["datetime", "country_code"]) + taxi_holidays_df[taxi_holidays_df["normalizeHolidayName"].notnull()] + + + weather_df = pd.read_parquet(os.path.join(args.input_folder,args.weather_file_name)) + + weather_df["datetime"] = weather_df["datetime"].dt.normalize() + + # group by datetime + aggregations = {"precipTime": "max", "temperature": "mean", "precipDepth": "max"} + weather_df_grouped = weather_df.groupby("datetime").agg(aggregations) + weather_df_grouped.head(10) + + taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how="left", on=["datetime"]) + taxi_holidays_weather_df.describe() + + final_df = taxi_holidays_weather_df.query("pickupLatitude>=40.53 and pickupLatitude<=40.88 and \ + pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \ + tripDistance>0 and tripDistance<75 and \ + passengerCount>0 and passengerCount<100 and \ + totalAmount>0") + final_df, test_df = train_test_split(final_df, test_size=0.2, random_state=100) + os.makedirs(args.prep_data, exist_ok=True) + + if args.run_mode =='local': + print("Data Files were written successfully to folder:", args.prep_data) + + if args.run_mode =='remote': + print("Data Files were written successfully to AZML Default Data Store folder") + + final_df.to_parquet(os.path.join(args.prep_data, "final_df.parquet")) + test_df.to_parquet(os.path.join(args.prep_data, "test_df.parquet")) + + +# run script +if __name__ == "__main__": + # parse args + args = parse_args() + + # run main function + main(args) diff --git a/src/workshop/core/data_engineering/feature_engineering.yml b/src/workshop/core/data_engineering/feature_engineering.yml index e60dd5f3..ebc38150 100644 --- a/src/workshop/core/data_engineering/feature_engineering.yml +++ b/src/workshop/core/data_engineering/feature_engineering.yml @@ -1,28 +1,28 @@ -$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json -code: ./ -command: >- - python feature_engineering.py - --input_folder ${{inputs.input_folder}} - --prep_data ${{outputs.prep_data}} - --run_mode ${{inputs.run_mode}} - -inputs: - input_folder: - type: uri_folder - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ - run_mode: "remote" - -outputs: - prep_data: - type: uri_folder - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ - - -environment: - conda_file: ./conda_feature_engineering.yml - image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest - -compute: azureml:cpu-cluster -display_name: feature-engineering -experiment_name: feature-engineering -description: feature engineering +$schema: 
https://azuremlschemas.azureedge.net/latest/commandJob.schema.json +code: ./ +command: >- + python feature_engineering.py + --input_folder ${{inputs.input_folder}} + --prep_data ${{outputs.prep_data}} + --run_mode ${{inputs.run_mode}} + +inputs: + input_folder: + type: uri_folder + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ + run_mode: "remote" + +outputs: + prep_data: + type: uri_folder + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ + + +environment: + conda_file: ./conda_feature_engineering.yml + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest + +compute: azureml:cpu-cluster +display_name: feature-engineering +experiment_name: feature-engineering +description: feature engineering diff --git a/src/workshop/core/evaluating/.amlignore b/src/workshop/core/evaluating/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/evaluating/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/evaluating/.amlignore.amltmp b/src/workshop/core/evaluating/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/evaluating/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/evaluating/conda_ml_evaluating.yml b/src/workshop/core/evaluating/conda_ml_evaluating.yml index c19cc217..f5cd52ab 100644 --- a/src/workshop/core/evaluating/conda_ml_evaluating.yml +++ b/src/workshop/core/evaluating/conda_ml_evaluating.yml @@ -1,11 +1,11 @@ -name: evaluating -channels: - - conda-forge -dependencies: - - python=3.8 - - pip=21.3.1 - - pip: - - azureml-sdk==1.38.0 - - azureml-mlflow==1.38.0 - - pandas==1.3.5 +name: evaluating +channels: + - conda-forge +dependencies: + - python=3.8 + - pip=21.3.1 + - pip: + - azureml-sdk==1.38.0 + - azureml-mlflow==1.38.0 + - pandas==1.3.5 - scikit-learn==1.0.2 \ No newline at end of file diff --git a/src/workshop/core/evaluating/ml_evaluating.py b/src/workshop/core/evaluating/ml_evaluating.py index 6e63077b..81f4cb1f 100644 --- a/src/workshop/core/evaluating/ml_evaluating.py +++ b/src/workshop/core/evaluating/ml_evaluating.py @@ -1,126 +1,126 @@ -import pandas as pd -import numpy as np -import os -import argparse -from azureml.core import Run, Dataset,Datastore, Workspace -from sklearn.linear_model import LinearRegression -from sklearn.ensemble import RandomForestRegressor -from sklearn.model_selection import train_test_split -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder -from sklearn.impute import SimpleImputer -from sklearn.compose import ColumnTransformer -from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error -import joblib -from azureml.core import Model -import mlflow -def parse_args(): - # setup arg parser - parser = argparse.ArgumentParser() - - 
parser.add_argument("--input_file_name", type=str, default="test_df.parquet") - parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data") - parser.add_argument("--model_folder", default="data", type=str, help="Path to model data") - parser.add_argument("--model_name",default='nyc_fare_prediction',type=str, help="Name of the model in workspace") - parser.add_argument("--run_mode", type=str, default="local") - - - # parse args - args = parser.parse_args() - - # return args - return args - - - -def main(args): - if args.run_mode =='remote': - run = Run.get_context() - ws = run.experiment.workspace - run_id = run.id - - # read in data - test_df = pd.read_parquet(os.path.join(args.prep_data,args.input_file_name)) - - catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"] - # num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"] - label = ["totalAmount"] - # make sure categorical columns are strings - test_df[catg_cols] = test_df[catg_cols].astype("str") - - # split data - y_test = test_df[label] - X_test = test_df.drop(label, axis=1) - - # load model' - - if args.run_mode =='local': - model_file = "linear_regression.joblib" - model_path=os.path.join(args.model_folder,model_file) - current_model = joblib.load(model_path) - y_pred_current = current_model.predict(X_test) - r2 = r2_score(y_test, y_pred_current) - mape = mean_absolute_percentage_error(y_test, y_pred_current) - rmse = np.sqrt(mean_squared_error(y_test, y_pred_current)) - print("Evaluation finished! Metrics:") - print(f"R2:", r2) - print(f"MAPE:", mape) - print(f"RMSE:", rmse) - - if args.run_mode =='remote': - - for model_file in os.listdir(args.model_folder): - if ".joblib" in model_file: - candidate_model_file=model_file - candidate_model_path=os.path.join(args.model_folder,candidate_model_file) - candidate_model = joblib.load(candidate_model_path) - - y_pred_candidate = candidate_model.predict(X_test) - r2_candidate = r2_score(y_test, y_pred_candidate) - mape_candidate = mean_absolute_percentage_error(y_test, y_pred_candidate) - rmse_candidate = np.sqrt(mean_squared_error(y_test, y_pred_candidate)) - mlflow.log_metric("mape_candidate",mape_candidate) - mlflow.log_metric("r2_candidate",r2_candidate) - mlflow.log_metric("rmse_candidate",rmse_candidate) - - current_model=None - - try: - current_model_aml = Model(ws,args.model_name) - os.makedirs("current_model", exist_ok=True) - current_model_aml.download("current_model",exist_ok=True) - current_model = mlflow.sklearn.load_model(os.path.join("current_model",args.model_name)) - except: - print("Model does not exist") - - if current_model: #current model exist, perform evaluation - # test 2 algorithms - y_pred_current = current_model.predict(X_test) - r2_current = r2_score(y_test, y_pred_current) - mape_current = mean_absolute_percentage_error(y_test, y_pred_current) - rmse_current = np.sqrt(mean_squared_error(y_test, y_pred_current)) - mlflow.log_metric("mape_current",mape_current) - mlflow.log_metric("r2_current",r2_current) - mlflow.log_metric("rmse_current",rmse_current) - if r2_candidate >= r2_current: - print("better model found, registering") - mlflow.sklearn.log_model(candidate_model,args.model_name) - model_uri = f'runs:/{run_id}/{args.model_name}' - mlflow.register_model(model_uri,args.model_name) - - else: - raise Exception("candidate model does not perform better, exiting") - - else: - print("First time model train, 
registering") - mlflow.sklearn.log_model(candidate_model,args.model_name) - model_uri = f'runs:/{run_id}/{args.model_name}' - mlflow.register_model(model_uri,args.model_name) - -# run script -if __name__ == "__main__": - # parse args - args = parse_args() - - # run main function - main(args) +import pandas as pd +import numpy as np +import os +import argparse +from azureml.core import Run, Dataset,Datastore, Workspace +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.compose import ColumnTransformer +from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error +import joblib +from azureml.core import Model +import mlflow +def parse_args(): + # setup arg parser + parser = argparse.ArgumentParser() + + parser.add_argument("--input_file_name", type=str, default="test_df.parquet") + parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data") + parser.add_argument("--model_folder", default="data", type=str, help="Path to model data") + parser.add_argument("--model_name",default='nyc_fare_prediction',type=str, help="Name of the model in workspace") + parser.add_argument("--run_mode", type=str, default="local") + + + # parse args + args = parser.parse_args() + + # return args + return args + + + +def main(args): + if args.run_mode =='remote': + run = Run.get_context() + ws = run.experiment.workspace + run_id = run.id + + # read in data + test_df = pd.read_parquet(os.path.join(args.prep_data,args.input_file_name)) + + catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"] + # num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"] + label = ["totalAmount"] + # make sure categorical columns are strings + test_df[catg_cols] = test_df[catg_cols].astype("str") + + # split data + y_test = test_df[label] + X_test = test_df.drop(label, axis=1) + + # load model' + + if args.run_mode =='local': + model_file = "linear_regression.joblib" + model_path=os.path.join(args.model_folder,model_file) + current_model = joblib.load(model_path) + y_pred_current = current_model.predict(X_test) + r2 = r2_score(y_test, y_pred_current) + mape = mean_absolute_percentage_error(y_test, y_pred_current) + rmse = np.sqrt(mean_squared_error(y_test, y_pred_current)) + print("Evaluation finished! 
Metrics:") + print(f"R2:", r2) + print(f"MAPE:", mape) + print(f"RMSE:", rmse) + + if args.run_mode =='remote': + + for model_file in os.listdir(args.model_folder): + if ".joblib" in model_file: + candidate_model_file=model_file + candidate_model_path=os.path.join(args.model_folder,candidate_model_file) + candidate_model = joblib.load(candidate_model_path) + + y_pred_candidate = candidate_model.predict(X_test) + r2_candidate = r2_score(y_test, y_pred_candidate) + mape_candidate = mean_absolute_percentage_error(y_test, y_pred_candidate) + rmse_candidate = np.sqrt(mean_squared_error(y_test, y_pred_candidate)) + mlflow.log_metric("mape_candidate",mape_candidate) + mlflow.log_metric("r2_candidate",r2_candidate) + mlflow.log_metric("rmse_candidate",rmse_candidate) + + current_model=None + + try: + current_model_aml = Model(ws,args.model_name) + os.makedirs("current_model", exist_ok=True) + current_model_aml.download("current_model",exist_ok=True) + current_model = mlflow.sklearn.load_model(os.path.join("current_model",args.model_name)) + except: + print("Model does not exist") + + if current_model: #current model exist, perform evaluation + # test 2 algorithms + y_pred_current = current_model.predict(X_test) + r2_current = r2_score(y_test, y_pred_current) + mape_current = mean_absolute_percentage_error(y_test, y_pred_current) + rmse_current = np.sqrt(mean_squared_error(y_test, y_pred_current)) + mlflow.log_metric("mape_current",mape_current) + mlflow.log_metric("r2_current",r2_current) + mlflow.log_metric("rmse_current",rmse_current) + if r2_candidate >= r2_current: + print("better model found, registering") + mlflow.sklearn.log_model(candidate_model,args.model_name) + model_uri = f'runs:/{run_id}/{args.model_name}' + mlflow.register_model(model_uri,args.model_name) + + else: + raise Exception("candidate model does not perform better, exiting") + + else: + print("First time model train, registering") + mlflow.sklearn.log_model(candidate_model,args.model_name) + model_uri = f'runs:/{run_id}/{args.model_name}' + mlflow.register_model(model_uri,args.model_name) + +# run script +if __name__ == "__main__": + # parse args + args = parse_args() + + # run main function + main(args) diff --git a/src/workshop/core/evaluating/ml_evaluating.yml b/src/workshop/core/evaluating/ml_evaluating.yml index 25c6ff09..9cce3880 100644 --- a/src/workshop/core/evaluating/ml_evaluating.yml +++ b/src/workshop/core/evaluating/ml_evaluating.yml @@ -1,23 +1,23 @@ -$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json -code: ./ -command: >- - python ml_evaluating.py - --prep_data ${{inputs.prep_data}} - --model_folder ${{inputs.model_folder}} - --run_mode ${{inputs.run_mode}} - -inputs: - prep_data: - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ - model_folder: - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ - run_mode: "remote" - -environment: - conda_file: ./conda_ml_evaluating.yml - image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest - -compute: azureml:cpu-cluster -display_name: ml-evaluation -experiment_name: ml-evaluation -description: ml-evaluation +$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json +code: ./ +command: >- + python ml_evaluating.py + --prep_data ${{inputs.prep_data}} + --model_folder ${{inputs.model_folder}} + --run_mode ${{inputs.run_mode}} + +inputs: + prep_data: + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ + model_folder: + path: 
azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ + run_mode: "remote" + +environment: + conda_file: ./conda_ml_evaluating.yml + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest + +compute: azureml:cpu-cluster +display_name: ml-evaluation +experiment_name: ml-evaluation +description: ml-evaluation diff --git a/src/workshop/core/pipelines/.amlignore b/src/workshop/core/pipelines/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/pipelines/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/pipelines/.amlignore.amltmp b/src/workshop/core/pipelines/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/pipelines/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/pipelines/adf/adf_pipeline.json b/src/workshop/core/pipelines/adf/adf_pipeline.json index 592c104f..21726c3d 100644 --- a/src/workshop/core/pipelines/adf/adf_pipeline.json +++ b/src/workshop/core/pipelines/adf/adf_pipeline.json @@ -1,91 +1,91 @@ -{ - "name": "Azure_SQL_ML_Pipeline", - "properties": { - "activities": [ - { - "name": "copy data from sql", - "type": "Copy", - "dependsOn": [], - "policy": { - "timeout": "0.12:00:00", - "retry": 0, - "retryIntervalInSeconds": 30, - "secureOutput": false, - "secureInput": false - }, - "userProperties": [], - "typeProperties": { - "source": { - "type": "AzureSqlSource", - "sqlReaderQuery": { - "value": "@concat('select * from green_taxi WHERE lpepPickupDatetime >','''',formatDateTime(adddays(utcnow(),-3190), 'yyyy-MM-dd'),'''')\n", - "type": "Expression" - }, - "queryTimeout": "02:00:00", - "partitionOption": "None" - }, - "sink": { - "type": "ParquetSink", - "storeSettings": { - "type": "AzureBlobStorageWriteSettings" - }, - "formatSettings": { - "type": "ParquetWriteSettings" - } - }, - "enableStaging": false, - "translator": { - "type": "TabularTranslator", - "typeConversion": true, - "typeConversionSettings": { - "allowDataTruncation": true, - "treatBooleanAsNumber": false - } - } - }, - "inputs": [ - { - "referenceName": "AzureSqlDemo", - "type": "DatasetReference" - } - ], - "outputs": [ - { - "referenceName": "parquetdata", - "type": "DatasetReference" - } - ] - }, - { - "name": "Machine Learning Execute Pipeline", - "type": "AzureMLExecutePipeline", - "dependsOn": [ - { - "activity": "copy data from sql", - "dependencyConditions": [ - "Succeeded" - ] - } - ], - "policy": { - "timeout": "0.12:00:00", - "retry": 0, - "retryIntervalInSeconds": 30, - "secureOutput": false, - "secureInput": false - }, - "userProperties": [], - "typeProperties": { - "mlPipelineEndpointId": "3337b14a-4a0a-47d3-817b-e88e1e7c68e6" - }, - "linkedServiceName": { - "referenceName": "amlws01ent", - "type": "LinkedServiceReference" - } - } - ], - "annotations": [], - "lastPublishTime": "2022-10-05T21:24:10Z" 
- }, - "type": "Microsoft.DataFactory/factories/pipelines" +{ + "name": "Azure_SQL_ML_Pipeline", + "properties": { + "activities": [ + { + "name": "copy data from sql", + "type": "Copy", + "dependsOn": [], + "policy": { + "timeout": "0.12:00:00", + "retry": 0, + "retryIntervalInSeconds": 30, + "secureOutput": false, + "secureInput": false + }, + "userProperties": [], + "typeProperties": { + "source": { + "type": "AzureSqlSource", + "sqlReaderQuery": { + "value": "@concat('select * from green_taxi WHERE lpepPickupDatetime >','''',formatDateTime(adddays(utcnow(),-3190), 'yyyy-MM-dd'),'''')\n", + "type": "Expression" + }, + "queryTimeout": "02:00:00", + "partitionOption": "None" + }, + "sink": { + "type": "ParquetSink", + "storeSettings": { + "type": "AzureBlobStorageWriteSettings" + }, + "formatSettings": { + "type": "ParquetWriteSettings" + } + }, + "enableStaging": false, + "translator": { + "type": "TabularTranslator", + "typeConversion": true, + "typeConversionSettings": { + "allowDataTruncation": true, + "treatBooleanAsNumber": false + } + } + }, + "inputs": [ + { + "referenceName": "AzureSqlDemo", + "type": "DatasetReference" + } + ], + "outputs": [ + { + "referenceName": "parquetdata", + "type": "DatasetReference" + } + ] + }, + { + "name": "Machine Learning Execute Pipeline", + "type": "AzureMLExecutePipeline", + "dependsOn": [ + { + "activity": "copy data from sql", + "dependencyConditions": [ + "Succeeded" + ] + } + ], + "policy": { + "timeout": "0.12:00:00", + "retry": 0, + "retryIntervalInSeconds": 30, + "secureOutput": false, + "secureInput": false + }, + "userProperties": [], + "typeProperties": { + "mlPipelineEndpointId": "3337b14a-4a0a-47d3-817b-e88e1e7c68e6" + }, + "linkedServiceName": { + "referenceName": "amlws01ent", + "type": "LinkedServiceReference" + } + } + ], + "annotations": [], + "lastPublishTime": "2022-10-05T21:24:10Z" + }, + "type": "Microsoft.DataFactory/factories/pipelines" } \ No newline at end of file diff --git a/src/workshop/core/pipelines/batch_scoring_pipeline.yml b/src/workshop/core/pipelines/batch_scoring_pipeline.yml index f1464887..49f0bc59 100644 --- a/src/workshop/core/pipelines/batch_scoring_pipeline.yml +++ b/src/workshop/core/pipelines/batch_scoring_pipeline.yml @@ -1,54 +1,54 @@ -$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json -type: pipeline -display_name: MLOps-Batch-Scoring-Pipeline -compute: azureml:cpu-cluster -settings: - force_rerun: true -jobs: - data_engineering: - type: command - component: ./data_engineering_comp.yml - inputs: - input_folder: - type: uri_folder - mode: ro_mount - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/batch_scoring/inputs - outputs: - output_folder: - type: mltable - mode: rw_mount - scoring: - type: parallel - mini_batch_size: "1" - mini_batch_error_threshold: -1 - max_concurrency_per_instance: 2 - retry_settings: - max_retries: 1 - timeout: 60 - resources: - instance_count: 2 - inputs: - scoring_data_folder: - type: mltable - mode: eval_mount - path: ${{parent.jobs.data_engineering.outputs.output_folder}} - outputs: - predictions_data_folder: - type: uri_folder - mode: rw_mount - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/batch_scoring/predictions - prediction_log: - type: uri_file - mode: rw_mount - input_data: ${{inputs.scoring_data_folder}} - task: - type: function - code: ../scoring/batch_scoring - entry_script: batch_score.py - environment: - name: mlops_batchscoring - conda_file: ../scoring/conda.yml - image: 
mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest - program_arguments: --predictions_data_folder ${{outputs.predictions_data_folder}} - append_row_to: ${{outputs.prediction_log}} - +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json +type: pipeline +display_name: MLOps-Batch-Scoring-Pipeline +compute: azureml:cpu-cluster +settings: + force_rerun: true +jobs: + data_engineering: + type: command + component: ./data_engineering_comp.yml + inputs: + input_folder: + type: uri_folder + mode: ro_mount + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/batch_scoring/inputs + outputs: + output_folder: + type: mltable + mode: rw_mount + scoring: + type: parallel + mini_batch_size: "1" + mini_batch_error_threshold: -1 + max_concurrency_per_instance: 2 + retry_settings: + max_retries: 1 + timeout: 60 + resources: + instance_count: 2 + inputs: + scoring_data_folder: + type: mltable + mode: eval_mount + path: ${{parent.jobs.data_engineering.outputs.output_folder}} + outputs: + predictions_data_folder: + type: uri_folder + mode: rw_mount + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/batch_scoring/predictions + prediction_log: + type: uri_file + mode: rw_mount + input_data: ${{inputs.scoring_data_folder}} + task: + type: function + code: ../scoring/batch_scoring + entry_script: batch_score.py + environment: + name: mlops_batchscoring + conda_file: ../scoring/conda.yml + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest + program_arguments: --predictions_data_folder ${{outputs.predictions_data_folder}} + append_row_to: ${{outputs.prediction_log}} + diff --git a/src/workshop/core/pipelines/data_engineering_comp.yml b/src/workshop/core/pipelines/data_engineering_comp.yml index 75958db7..f8f2febe 100644 --- a/src/workshop/core/pipelines/data_engineering_comp.yml +++ b/src/workshop/core/pipelines/data_engineering_comp.yml @@ -1,19 +1,19 @@ -$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json -name: data_engineering -display_name: data engineering -version: 10 -type: command -code: ../scoring/batch_scoring -command: >- - python data_engineering.py --input_folder ${{inputs.input_folder}} --output_folder ${{outputs.output_folder}}; -inputs: - input_folder: - type: uri_folder -outputs: - output_folder: - type: mltable -is_deterministic: false -environment: - name: mlops_batchscoring - conda_file: ../scoring/conda.yml +$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json +name: data_engineering +display_name: data engineering +version: 10 +type: command +code: ../scoring/batch_scoring +command: >- + python data_engineering.py --input_folder ${{inputs.input_folder}} --output_folder ${{outputs.output_folder}}; +inputs: + input_folder: + type: uri_folder +outputs: + output_folder: + type: mltable +is_deterministic: false +environment: + name: mlops_batchscoring + conda_file: ../scoring/conda.yml image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest \ No newline at end of file diff --git a/src/workshop/core/pipelines/training_pipeline.yml b/src/workshop/core/pipelines/training_pipeline.yml index 2b6707f1..d395c801 100644 --- a/src/workshop/core/pipelines/training_pipeline.yml +++ b/src/workshop/core/pipelines/training_pipeline.yml @@ -1,71 +1,71 @@ -$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json -type: pipeline -display_name: Training_pipeline -experiment_name: Training_pipeline -compute: azureml:cpu-cluster - -jobs: - 
prep_job: - type: command - code: ../data_engineering - command: >- - python feature_engineering.py - --input_folder ${{inputs.input_folder}} - --prep_data ${{outputs.prep_data}} - --run_mode ${{inputs.run_mode}} - inputs: - input_folder: - type: uri_folder - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ - run_mode: "remote" - outputs: - prep_data: - type: uri_folder - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ - mode: rw_mount - environment: - conda_file: ../data_engineering/conda_feature_engineering.yml - image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest - description: Feature Engineering - - train_job: - type: command - code: ../training - command: >- - python ml_training.py - --prep_data ${{inputs.prep_data}} - --model_folder ${{outputs.model_folder}} - --run_mode ${{inputs.run_mode}} - inputs: - prep_data: ${{parent.jobs.prep_job.outputs.prep_data}} - run_mode: "remote" - outputs: - model_folder: - type: uri_folder - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ - mode: rw_mount - environment: - conda_file: ../training/conda_ml_training.yml - image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest - description: ML Training - - evaluate_job: - type: command - code: ../evaluating - command: >- - python ml_evaluating.py - --run_mode ${{inputs.run_mode}} - --model_name ${{inputs.model_name}} - --prep_data ${{inputs.prep_data}} - --model_folder ${{inputs.model_folder}} - inputs: - run_mode: "remote" - model_name: "nyc_fare_prediction" - prep_data: ${{parent.jobs.prep_job.outputs.prep_data}} - model_folder: ${{parent.jobs.train_job.outputs.model_folder}} - - environment: - conda_file: ../evaluating/conda_ml_evaluating.yml - image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest - description: model-evaluation - +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json +type: pipeline +display_name: Training_pipeline +experiment_name: Training_pipeline +compute: azureml:cpu-cluster + +jobs: + prep_job: + type: command + code: ../data_engineering + command: >- + python feature_engineering.py + --input_folder ${{inputs.input_folder}} + --prep_data ${{outputs.prep_data}} + --run_mode ${{inputs.run_mode}} + inputs: + input_folder: + type: uri_folder + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ + run_mode: "remote" + outputs: + prep_data: + type: uri_folder + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ + mode: rw_mount + environment: + conda_file: ../data_engineering/conda_feature_engineering.yml + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest + description: Feature Engineering + + train_job: + type: command + code: ../training + command: >- + python ml_training.py + --prep_data ${{inputs.prep_data}} + --model_folder ${{outputs.model_folder}} + --run_mode ${{inputs.run_mode}} + inputs: + prep_data: ${{parent.jobs.prep_job.outputs.prep_data}} + run_mode: "remote" + outputs: + model_folder: + type: uri_folder + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ + mode: rw_mount + environment: + conda_file: ../training/conda_ml_training.yml + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest + description: ML Training + + evaluate_job: + type: command + code: ../evaluating + command: >- + python ml_evaluating.py + --run_mode ${{inputs.run_mode}} + --model_name ${{inputs.model_name}} + --prep_data ${{inputs.prep_data}} + --model_folder 
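The training pipeline above chains prep_job, train_job and evaluate_job through ${{parent.jobs.<job>.outputs.<name>}} references. A rough local equivalent of that ordering is sketched below; the relative paths are assumptions based on the code: entries in the YAML and the workshop's data folder:

# Local dry run of the same prep -> train -> evaluate sequence; run from src/workshop/core.
import subprocess

subprocess.run(
    ["python", "data_engineering/feature_engineering.py",
     "--input_folder", "../data", "--prep_data", "../data", "--run_mode", "local"],
    check=True)
subprocess.run(
    ["python", "training/ml_training.py",
     "--prep_data", "../data", "--model_folder", "../data", "--run_mode", "local"],
    check=True)
subprocess.run(
    ["python", "evaluating/ml_evaluating.py",
     "--run_mode", "local", "--model_name", "nyc_fare_prediction",
     "--prep_data", "../data", "--model_folder", "../data"],
    check=True)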
${{inputs.model_folder}} + inputs: + run_mode: "remote" + model_name: "nyc_fare_prediction" + prep_data: ${{parent.jobs.prep_job.outputs.prep_data}} + model_folder: ${{parent.jobs.train_job.outputs.model_folder}} + + environment: + conda_file: ../evaluating/conda_ml_evaluating.yml + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest + description: model-evaluation + diff --git a/src/workshop/core/scoring/.amlignore b/src/workshop/core/scoring/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/scoring/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/scoring/.amlignore.amltmp b/src/workshop/core/scoring/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/scoring/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/scoring/batch_scoring/batch_score.py b/src/workshop/core/scoring/batch_scoring/batch_score.py index 0aadc6d4..ae53dc7b 100644 --- a/src/workshop/core/scoring/batch_scoring/batch_score.py +++ b/src/workshop/core/scoring/batch_scoring/batch_score.py @@ -1,48 +1,48 @@ - -import os -import tempfile -import logging -from azureml.core.model import Model -import pickle -import pandas as pd -from azureml.core import Run -import os -import mlflow -import argparse,os,datetime - -def init(): - global model,predictions_data_folder - parser = argparse.ArgumentParser() - parser.add_argument("--predictions_data_folder", type=str) - parser.add_argument("--model_name",default='nyc_fare_prediction',type=str, help="Name of the model in workspace") - args, unknown = parser.parse_known_args() - predictions_data_folder = args.predictions_data_folder - print("predictions_data_folder",predictions_data_folder) - current_run = Run.get_context() - ws = current_run.experiment.workspace - model = Model(ws,args.model_name) - model.download(exist_ok=True) - model = mlflow.sklearn.load_model(args.model_name) - -def run(mini_batch): - - - print(f'run method start: {__file__}, run({mini_batch})') - i =0 - for file in mini_batch: - # prepare each image - data = pd.read_parquet(file) - print("data shape ", data.shape) - predictions = model.predict(data) - data["prediction"] =predictions - today = datetime.datetime.today() - year = today.year - month = today.month - day = today.day - folder = "{:02d}-{:02d}-{:4d}".format(month,day,year) - os.makedirs(predictions_data_folder+"/"+folder, exist_ok=True) - data.to_csv(predictions_data_folder+"/"+folder+"/prediction.csv") - i+=1 - - - return [1]*i + +import os +import tempfile +import logging +from azureml.core.model import Model +import pickle +import pandas as pd +from azureml.core import Run +import os +import mlflow +import argparse,os,datetime + +def init(): + global model,predictions_data_folder + parser = argparse.ArgumentParser() + 
parser.add_argument("--predictions_data_folder", type=str) + parser.add_argument("--model_name",default='nyc_fare_prediction',type=str, help="Name of the model in workspace") + args, unknown = parser.parse_known_args() + predictions_data_folder = args.predictions_data_folder + print("predictions_data_folder",predictions_data_folder) + current_run = Run.get_context() + ws = current_run.experiment.workspace + model = Model(ws,args.model_name) + model.download(exist_ok=True) + model = mlflow.sklearn.load_model(args.model_name) + +def run(mini_batch): + + + print(f'run method start: {__file__}, run({mini_batch})') + i =0 + for file in mini_batch: + # prepare each image + data = pd.read_parquet(file) + print("data shape ", data.shape) + predictions = model.predict(data) + data["prediction"] =predictions + today = datetime.datetime.today() + year = today.year + month = today.month + day = today.day + folder = "{:02d}-{:02d}-{:4d}".format(month,day,year) + os.makedirs(predictions_data_folder+"/"+folder, exist_ok=True) + data.to_csv(predictions_data_folder+"/"+folder+"/prediction.csv") + i+=1 + + + return [1]*i diff --git a/src/workshop/core/scoring/batch_scoring/conda.yml b/src/workshop/core/scoring/batch_scoring/conda.yml index d1e053de..ab631829 100644 --- a/src/workshop/core/scoring/batch_scoring/conda.yml +++ b/src/workshop/core/scoring/batch_scoring/conda.yml @@ -1,11 +1,11 @@ -name: workshop-online-scoring -channels: - - conda-forge -dependencies: - - python=3.8.12 - - pip=21.3.1 - - pip: - - azureml-mlflow==1.38.0 - - azureml-defaults==1.38.0 - - pandas==1.3.5 +name: workshop-online-scoring +channels: + - conda-forge +dependencies: + - python=3.8.12 + - pip=21.3.1 + - pip: + - azureml-mlflow==1.38.0 + - azureml-defaults==1.38.0 + - pandas==1.3.5 - scikit-learn==1.0.2 \ No newline at end of file diff --git a/src/workshop/core/scoring/batch_scoring/data_engineering.py b/src/workshop/core/scoring/batch_scoring/data_engineering.py index 3178ed45..fb945983 100644 --- a/src/workshop/core/scoring/batch_scoring/data_engineering.py +++ b/src/workshop/core/scoring/batch_scoring/data_engineering.py @@ -1,121 +1,121 @@ -import pandas as pd -import numpy as np -from datetime import datetime -import argparse -import os - -import argparse,os -import pandas as pd -import datetime -# data engineering - -# read arguments - -def parse_args(): - # setup arg parser - parser = argparse.ArgumentParser() - - - # add arguments - parser.add_argument("--nyc_file_name", type=str, default="green_taxi.parquet") - parser.add_argument("--public_holiday_file_name", type=str, default="holidays.parquet") - parser.add_argument("--weather_file_name", type=str, default="weather.parquet") - parser.add_argument('--input_folder', type=str) - parser.add_argument('--output_folder', type=str) - - # parse args - args = parser.parse_args() - - # return args - return args - - -def build_time_features(vector): - pickup_datetime = vector[0] - month_num = pickup_datetime.month - day_of_month = pickup_datetime.day - day_of_week = pickup_datetime.weekday() - hour_of_day = pickup_datetime.hour - country_code = "US" - hr_sin = np.sin(hour_of_day*(2.*np.pi/24)) - hr_cos = np.cos(hour_of_day*(2.*np.pi/24)) - dy_sin = np.sin(day_of_week*(2.*np.pi/7)) - dy_cos = np.cos(day_of_week*(2.*np.pi/7)) - - return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos)) -def engineer_features(green_taxi_df,holidays_df,weather_df ): - - green_taxi_df[["month_num", "day_of_month","day_of_week", 
"hour_of_day", "country_code", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]] = \ - green_taxi_df[["lpepPickupDatetime"]].apply(build_time_features, axis=1) - - columns_to_remove = ["lpepDropoffDatetime", "puLocationId", "doLocationId", "extra", "mtaTax", - "improvementSurcharge", "tollsAmount", "ehailFee", "tripType", "rateCodeID", - "storeAndFwdFlag", "paymentType", "fareAmount", "tipAmount"] - - green_taxi_df.drop(columns_to_remove, axis=1, inplace=True) - - - green_taxi_df["datetime"] = green_taxi_df["lpepPickupDatetime"].dt.normalize() - - - holidays_df = holidays_df.rename(columns={"countryRegionCode": "country_code"}) - holidays_df["datetime"] = holidays_df["date"].dt.normalize() - - holidays_df.drop(["countryOrRegion", "holidayName", "date"], axis=1, inplace=True) - - taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how="left", on=["datetime", "country_code"]) - taxi_holidays_df[taxi_holidays_df["normalizeHolidayName"].notnull()] - - - - weather_df["datetime"] = weather_df["datetime"].dt.normalize() - - # group by datetime - aggregations = {"precipTime": "max", "temperature": "mean", "precipDepth": "max"} - weather_df_grouped = weather_df.groupby("datetime").agg(aggregations) - - taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how="left", on=["datetime"]) - - final_df = taxi_holidays_weather_df.query("pickupLatitude>=40.53 and pickupLatitude<=40.88 and \ - pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \ - tripDistance>0 and tripDistance<75 and \ - passengerCount>0 and passengerCount<100") - return final_df - -def main(args): - - # read in data - today = datetime.datetime.today() - year = today.year - month = today.month - day = today.day - folder = "{:02d}-{:02d}-{:4d}".format(month,day,year) - green_taxi_df = pd.read_parquet(os.path.join(args.input_folder,folder, args.nyc_file_name)) - - - holidays_df = pd.read_parquet(os.path.join(args.input_folder,folder, args.public_holiday_file_name)) - - weather_df = pd.read_parquet(os.path.join(args.input_folder,folder,args.weather_file_name)) - - final_df = engineer_features(green_taxi_df, holidays_df, weather_df) - # if os.path.exists(args.output_folder): - # os.remove(args.output_folder) - - final_df.to_parquet(args.output_folder+"/data.parquet") - print("done writing data") - ml_table_content = """ -paths: - - pattern: ./*.parquet - """ - with open(os.path.join(args.output_folder,"MLTable"),'w') as mltable_file: - mltable_file.writelines(ml_table_content) - - - -# run script -if __name__ == "__main__": - # parse args - args = parse_args() - - # run main function - main(args) +import pandas as pd +import numpy as np +from datetime import datetime +import argparse +import os + +import argparse,os +import pandas as pd +import datetime +# data engineering + +# read arguments + +def parse_args(): + # setup arg parser + parser = argparse.ArgumentParser() + + + # add arguments + parser.add_argument("--nyc_file_name", type=str, default="green_taxi.parquet") + parser.add_argument("--public_holiday_file_name", type=str, default="holidays.parquet") + parser.add_argument("--weather_file_name", type=str, default="weather.parquet") + parser.add_argument('--input_folder', type=str) + parser.add_argument('--output_folder', type=str) + + # parse args + args = parser.parse_args() + + # return args + return args + + +def build_time_features(vector): + pickup_datetime = vector[0] + month_num = pickup_datetime.month + day_of_month = pickup_datetime.day + day_of_week = pickup_datetime.weekday() + hour_of_day = 
pickup_datetime.hour + country_code = "US" + hr_sin = np.sin(hour_of_day*(2.*np.pi/24)) + hr_cos = np.cos(hour_of_day*(2.*np.pi/24)) + dy_sin = np.sin(day_of_week*(2.*np.pi/7)) + dy_cos = np.cos(day_of_week*(2.*np.pi/7)) + + return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos)) +def engineer_features(green_taxi_df,holidays_df,weather_df ): + + green_taxi_df[["month_num", "day_of_month","day_of_week", "hour_of_day", "country_code", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]] = \ + green_taxi_df[["lpepPickupDatetime"]].apply(build_time_features, axis=1) + + columns_to_remove = ["lpepDropoffDatetime", "puLocationId", "doLocationId", "extra", "mtaTax", + "improvementSurcharge", "tollsAmount", "ehailFee", "tripType", "rateCodeID", + "storeAndFwdFlag", "paymentType", "fareAmount", "tipAmount"] + + green_taxi_df.drop(columns_to_remove, axis=1, inplace=True) + + + green_taxi_df["datetime"] = green_taxi_df["lpepPickupDatetime"].dt.normalize() + + + holidays_df = holidays_df.rename(columns={"countryRegionCode": "country_code"}) + holidays_df["datetime"] = holidays_df["date"].dt.normalize() + + holidays_df.drop(["countryOrRegion", "holidayName", "date"], axis=1, inplace=True) + + taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how="left", on=["datetime", "country_code"]) + taxi_holidays_df[taxi_holidays_df["normalizeHolidayName"].notnull()] + + + + weather_df["datetime"] = weather_df["datetime"].dt.normalize() + + # group by datetime + aggregations = {"precipTime": "max", "temperature": "mean", "precipDepth": "max"} + weather_df_grouped = weather_df.groupby("datetime").agg(aggregations) + + taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how="left", on=["datetime"]) + + final_df = taxi_holidays_weather_df.query("pickupLatitude>=40.53 and pickupLatitude<=40.88 and \ + pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \ + tripDistance>0 and tripDistance<75 and \ + passengerCount>0 and passengerCount<100") + return final_df + +def main(args): + + # read in data + today = datetime.datetime.today() + year = today.year + month = today.month + day = today.day + folder = "{:02d}-{:02d}-{:4d}".format(month,day,year) + green_taxi_df = pd.read_parquet(os.path.join(args.input_folder,folder, args.nyc_file_name)) + + + holidays_df = pd.read_parquet(os.path.join(args.input_folder,folder, args.public_holiday_file_name)) + + weather_df = pd.read_parquet(os.path.join(args.input_folder,folder,args.weather_file_name)) + + final_df = engineer_features(green_taxi_df, holidays_df, weather_df) + # if os.path.exists(args.output_folder): + # os.remove(args.output_folder) + + final_df.to_parquet(args.output_folder+"/data.parquet") + print("done writing data") + ml_table_content = """ +paths: + - pattern: ./*.parquet + """ + with open(os.path.join(args.output_folder,"MLTable"),'w') as mltable_file: + mltable_file.writelines(ml_table_content) + + + +# run script +if __name__ == "__main__": + # parse args + args = parse_args() + + # run main function + main(args) diff --git a/src/workshop/core/scoring/conda.yml b/src/workshop/core/scoring/conda.yml index d1e053de..ab631829 100644 --- a/src/workshop/core/scoring/conda.yml +++ b/src/workshop/core/scoring/conda.yml @@ -1,11 +1,11 @@ -name: workshop-online-scoring -channels: - - conda-forge -dependencies: - - python=3.8.12 - - pip=21.3.1 - - pip: - - azureml-mlflow==1.38.0 - - azureml-defaults==1.38.0 - - pandas==1.3.5 +name: workshop-online-scoring +channels: + - conda-forge 
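data_engineering.py above writes data.parquet plus a minimal MLTable definition so the parallel scoring step can consume the folder as an mltable input. Assuming the mltable package is installed, the output could be read back roughly like this (the folder path is a placeholder):

import mltable

tbl = mltable.load("./output_folder")      # folder containing data.parquet and MLTable
df = tbl.to_pandas_dataframe()
print(df.shape)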
+dependencies: + - python=3.8.12 + - pip=21.3.1 + - pip: + - azureml-mlflow==1.38.0 + - azureml-defaults==1.38.0 + - pandas==1.3.5 - scikit-learn==1.0.2 \ No newline at end of file diff --git a/src/workshop/core/scoring/deployment.yml b/src/workshop/core/scoring/deployment.yml index 29c3500c..d3e22e52 100644 --- a/src/workshop/core/scoring/deployment.yml +++ b/src/workshop/core/scoring/deployment.yml @@ -1,12 +1,14 @@ -$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json -name: green -endpoint_name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml -model: azureml:nyc_fare_prediction:1 -code_configuration: - code: ./ - scoring_script: score.py -environment: - conda_file: ./conda.yml - image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 -instance_type: Standard_DS2_V2 -instance_count: 1 +$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json +name: blue +description: pilot deployment v1.7 +endpoint_name: anildwa1-mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml +#egress_public_network_access: disabled +model: azureml:nyc_fare_prediction:1 +code_configuration: + code: ./ + scoring_script: score.py +environment: + conda_file: ./conda.yml + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 +instance_type: Standard_DS1_v2 +instance_count: 1 diff --git a/src/workshop/core/scoring/deployment.yml.amltmp b/src/workshop/core/scoring/deployment.yml.amltmp new file mode 100644 index 00000000..fea6ea08 --- /dev/null +++ b/src/workshop/core/scoring/deployment.yml.amltmp @@ -0,0 +1,12 @@ +$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json +name: green +endpoint_name: anildwa-mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml +model: azureml:nyc_fare_prediction:1 +code_configuration: + code: ./ + scoring_script: score.py +environment: + conda_file: ./conda.yml + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 +instance_type: Standard_DS2_V2 +instance_count: 1 diff --git a/src/workshop/core/scoring/endpoint.yml b/src/workshop/core/scoring/endpoint.yml index 611e0721..e07cfa9b 100644 --- a/src/workshop/core/scoring/endpoint.yml +++ b/src/workshop/core/scoring/endpoint.yml @@ -1,3 +1,4 @@ -$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json -name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique -auth_mode: key +$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json +name: anildwa1-mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique +auth_mode: key +#public_network_access: disabled \ No newline at end of file diff --git a/src/workshop/core/scoring/endpoint.yml.amltmp b/src/workshop/core/scoring/endpoint.yml.amltmp new file mode 100644 index 00000000..45cb370e --- /dev/null +++ b/src/workshop/core/scoring/endpoint.yml.amltmp @@ -0,0 +1,3 @@ +$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json +name: anildwa-mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name. 
It has to be globally unique +auth_mode: key diff --git a/src/workshop/core/scoring/score.py b/src/workshop/core/scoring/score.py index 7debf93b..958d2369 100644 --- a/src/workshop/core/scoring/score.py +++ b/src/workshop/core/scoring/score.py @@ -1,25 +1,26 @@ -import json -import numpy as np -import pandas as pd -import os -from azureml.core.model import Model -import mlflow -# Called when the service is loaded -def init(): - global model - # Get the path to the deployed model file and load it - model_dir =os.getenv('AZUREML_MODEL_DIR') - model_file = os.listdir(model_dir)[0] - model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), model_file) - model = mlflow.sklearn.load_model(model_path) -# Called when a request is received -def run(raw_data): - try: - # Get the input data - data=pd.DataFrame(json.loads(raw_data)['data']) - # Get a prediction from the model - predictions = model.predict(data) - return json.dumps(predictions.tolist()) - except Exception as e: - error= str(e) +import json +import numpy as np +import pandas as pd +import os +from azureml.core.model import Model +import mlflow + +# Called when the service is loaded +def init(): + global model + # Get the path to the deployed model file and load it + model_dir =os.getenv('AZUREML_MODEL_DIR') + model_file = os.listdir(model_dir)[0] + model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), model_file) + model = mlflow.sklearn.load_model(model_path) +# Called when a request is received +def run(raw_data): + try: + # Get the input data + data=pd.DataFrame(json.loads(raw_data)['data']) + # Get a prediction from the model + predictions = model.predict(data) + return json.dumps(predictions.tolist()) + except Exception as e: + error= str(e) return json.dumps(error) \ No newline at end of file diff --git a/src/workshop/core/scoring/scoring_test_request.json b/src/workshop/core/scoring/scoring_test_request.json index 8bc5f27f..e49e83ee 100644 --- a/src/workshop/core/scoring/scoring_test_request.json +++ b/src/workshop/core/scoring/scoring_test_request.json @@ -1,27 +1,27 @@ -{ - "data": { - "vendorID": {"715": "2", "3633": "2"}, - "lpepPickupDatetime": {"715": "2016-01-04 20:48:38","3633": "2016-02-15 20:35:58"}, - "passengerCount": {"715": "1", "3633": "1"}, - "tripDistance": {"715": "1.14", "3633": "6.43"}, - "pickupLongitude": {"715": "-73.97727966308594", "3633": "-73.95679473876953"}, - "pickupLatitude": {"715": "40.68115234375", "3633": "40.74812316894531"}, - "dropoffLongitude": {"715": "-73.96723175048828", "3633": "-73.9059066772461"}, - "dropoffLatitude": {"715": "40.67363739013672", "3633": "40.76784896850586"}, - "month_num": {"715": "1", "3633": "2"}, - "day_of_month": {"715": "4", "3633": "15"}, - "day_of_week": {"715": "0", "3633": "0"}, - "hour_of_day": {"715": "20", "3633": "20"}, - "country_code": {"715": "US", "3633": "US"}, - "hr_sin": {"715": "-0.866025403784439", "3633": "-0.866025403784439"}, - "hr_cos": {"715": "0.4999999999999992", "3633": "0.4999999999999992"}, - "dy_sin": {"715": "0.0", "3633": "0.0"}, - "dy_cos": {"715": "1.0", "3633": "1.0"}, - "datetime": {"715": "2016-01-04 00:00:00", "3633": "2016-02-15 00:00:00"}, - "normalizeHolidayName": {"715": "nan", "3633": "Washington's Birthday"}, - "isPaidTimeOff": {"715": "nan", "3633": "True"}, - "precipTime": {"715": "1.0", "3633": "24.0"}, - "temperature": {"715": "0.12389380530973423", "3633": "-6.222602739726026"}, - "precipDepth": {"715": "0.0", "3633": "9999.0"} - } +{ + "data": { + "vendorID": {"715": "2", "3633": "2"}, + "lpepPickupDatetime": 
{"715": "2016-01-04 20:48:38","3633": "2016-02-15 20:35:58"}, + "passengerCount": {"715": "1", "3633": "1"}, + "tripDistance": {"715": "1.14", "3633": "6.43"}, + "pickupLongitude": {"715": "-73.97727966308594", "3633": "-73.95679473876953"}, + "pickupLatitude": {"715": "40.68115234375", "3633": "40.74812316894531"}, + "dropoffLongitude": {"715": "-73.96723175048828", "3633": "-73.9059066772461"}, + "dropoffLatitude": {"715": "40.67363739013672", "3633": "40.76784896850586"}, + "month_num": {"715": "1", "3633": "2"}, + "day_of_month": {"715": "4", "3633": "15"}, + "day_of_week": {"715": "0", "3633": "0"}, + "hour_of_day": {"715": "20", "3633": "20"}, + "country_code": {"715": "US", "3633": "US"}, + "hr_sin": {"715": "-0.866025403784439", "3633": "-0.866025403784439"}, + "hr_cos": {"715": "0.4999999999999992", "3633": "0.4999999999999992"}, + "dy_sin": {"715": "0.0", "3633": "0.0"}, + "dy_cos": {"715": "1.0", "3633": "1.0"}, + "datetime": {"715": "2016-01-04 00:00:00", "3633": "2016-02-15 00:00:00"}, + "normalizeHolidayName": {"715": "nan", "3633": "Washington's Birthday"}, + "isPaidTimeOff": {"715": "nan", "3633": "True"}, + "precipTime": {"715": "1.0", "3633": "24.0"}, + "temperature": {"715": "0.12389380530973423", "3633": "-6.222602739726026"}, + "precipDepth": {"715": "0.0", "3633": "9999.0"} + } } \ No newline at end of file diff --git a/src/workshop/core/training/.amlignore b/src/workshop/core/training/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/training/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/training/.amlignore.amltmp b/src/workshop/core/training/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/core/training/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. 
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/training/conda_ml_training.yml b/src/workshop/core/training/conda_ml_training.yml index 3e26a9f2..f28b0452 100644 --- a/src/workshop/core/training/conda_ml_training.yml +++ b/src/workshop/core/training/conda_ml_training.yml @@ -1,11 +1,11 @@ -name: ml-training -channels: - - conda-forge -dependencies: - - python=3.8 - - pip=21.3.1 - - pip: - - azureml-sdk==1.38.0 - - azureml-mlflow==1.38.0 - - pandas==1.3.5 +name: ml-training +channels: + - conda-forge +dependencies: + - python=3.8 + - pip=21.3.1 + - pip: + - azureml-sdk==1.38.0 + - azureml-mlflow==1.38.0 + - pandas==1.3.5 - scikit-learn==1.0.2 \ No newline at end of file diff --git a/src/workshop/core/training/ml_training.py b/src/workshop/core/training/ml_training.py index 6f59dcdd..e50ab542 100644 --- a/src/workshop/core/training/ml_training.py +++ b/src/workshop/core/training/ml_training.py @@ -1,103 +1,106 @@ -import pandas as pd -import numpy as np -import os -import argparse -import mlflow -import mlflow.sklearn -from azureml.core import Run, Dataset,Datastore, Workspace -from sklearn.linear_model import LinearRegression -from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import Ridge -from sklearn.model_selection import train_test_split -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder -from sklearn.impute import SimpleImputer -from sklearn.compose import ColumnTransformer -from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error -import joblib -def parse_args(): - # arg parser - parser = argparse.ArgumentParser() - - parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data, default to local folder") - parser.add_argument("--model_folder", type=str,default="data", help="Path of model ouput folder, default to local folder") - parser.add_argument("--input_file_name", type=str, default="final_df.parquet") - parser.add_argument("--run_mode", type=str, default="local") - - - # parse args - args = parser.parse_args() - - # return args - return args - - -def createClassModel(algo_name, catg, nums): - numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]) - - categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) - - preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)]) - - if algo_name == 'linear_regression': - #--------------------------------------------- - #setup: Update alpha value - #--------------------------------------------- - model = Ridge(alpha=100000) #setup - elif algo_name == 'random_forest': - model = RandomForestRegressor() - else: - pass - - ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), ("model", model)]) - - return ModelPipeline - -def main(args): - - # read in data - final_df = pd.read_parquet(os.path.join(args.prep_data,args.input_file_name)) - catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"] - num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"] - label = 
["totalAmount"] - # make sure categorical columns are strings - final_df[catg_cols] = final_df[catg_cols].astype("str") - - # split data - X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222) - - # test 2 algorithms - os.makedirs(args.model_folder, exist_ok=True) - - algorithmname = "linear_regression" - fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline - fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine - - y_pred = fitPipeline.predict(X_test) # score with fitted pipeline - - # Evaluate - r2 = r2_score(y_test, y_pred) - mape = mean_absolute_percentage_error(y_test, y_pred) - rmse = np.sqrt(mean_squared_error(y_test, y_pred)) - - - joblib.dump(fitPipeline,args.model_folder+"/"+algorithmname+".joblib") - - print("Training finished!. Metrics:") - print(f"R2_{algorithmname}", r2) - print(f"MAPE_{algorithmname}", mape) - print(f"RMSE_{algorithmname}", rmse) - print("Model",args.model_folder+"/"+algorithmname+".joblib","saved!") - - if args.run_mode == 'remote': - mlflow.log_metric(f"R2_{algorithmname}", r2) - mlflow.log_metric(f"MAPE_{algorithmname}", mape) - mlflow.log_metric(f"RMSE_{algorithmname}", rmse) - mlflow.sklearn.log_model(fitPipeline,f"{algorithmname}_model") - -# run script -if __name__ == "__main__": - # parse args - args = parse_args() - # run main function +import pandas as pd +import numpy as np +import os +import argparse +import mlflow +import mlflow.sklearn +from azureml.core import Run, Dataset,Datastore, Workspace +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.compose import ColumnTransformer +from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error +import joblib +def parse_args(): + # arg parser + parser = argparse.ArgumentParser() + + parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data, default to local folder") + parser.add_argument("--model_folder", type=str,default="data", help="Path of model ouput folder, default to local folder") + parser.add_argument("--input_file_name", type=str, default="final_df.parquet") + parser.add_argument("--run_mode", type=str, default="local") + + + # parse args + args = parser.parse_args() + + # return args + return args + + + +## happy path changes + +def createClassModel(algo_name, catg, nums): + numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]) + + categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) + + preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)]) + + if algo_name == 'linear_regression': + #--------------------------------------------- + #setup: Update alpha value + #--------------------------------------------- + model = Ridge(alpha=1000) #setup + elif algo_name == 'random_forest': + model = RandomForestRegressor() + else: + pass + + ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), ("model", model)]) + + return ModelPipeline + +def main(args): + + # read in data + final_df = 
pd.read_parquet(os.path.join(args.prep_data,args.input_file_name)) + catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"] + num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"] + label = ["totalAmount"] + # make sure categorical columns are strings + final_df[catg_cols] = final_df[catg_cols].astype("str") + + # split data + X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222) + + # test 2 algorithms + os.makedirs(args.model_folder, exist_ok=True) + + algorithmname = "linear_regression" + fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline + fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine + + y_pred = fitPipeline.predict(X_test) # score with fitted pipeline + + # Evaluate + r2 = r2_score(y_test, y_pred) + mape = mean_absolute_percentage_error(y_test, y_pred) + rmse = np.sqrt(mean_squared_error(y_test, y_pred)) + + + joblib.dump(fitPipeline,args.model_folder+"/"+algorithmname+".joblib") + + print("Training finished!. Metrics:") + print(f"R2_{algorithmname}", r2) + print(f"MAPE_{algorithmname}", mape) + print(f"RMSE_{algorithmname}", rmse) + print("Model",args.model_folder+"/"+algorithmname+".joblib","saved!") + + if args.run_mode == 'remote': + mlflow.log_metric(f"R2_{algorithmname}", r2) + mlflow.log_metric(f"MAPE_{algorithmname}", mape) + mlflow.log_metric(f"RMSE_{algorithmname}", rmse) + mlflow.sklearn.log_model(fitPipeline,f"{algorithmname}_model") + +# run script +if __name__ == "__main__": + # parse args + args = parse_args() + # run main function main(args) \ No newline at end of file diff --git a/src/workshop/core/training/ml_training.py.amltmp b/src/workshop/core/training/ml_training.py.amltmp new file mode 100644 index 00000000..01028cdd --- /dev/null +++ b/src/workshop/core/training/ml_training.py.amltmp @@ -0,0 +1,103 @@ +import pandas as pd +import numpy as np +import os +import argparse +import mlflow +import mlflow.sklearn +from azureml.core import Run, Dataset,Datastore, Workspace +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.compose import ColumnTransformer +from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error +import joblib +def parse_args(): + # arg parser + parser = argparse.ArgumentParser() + + parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data, default to local folder") + parser.add_argument("--model_folder", type=str,default="data", help="Path of model ouput folder, default to local folder") + parser.add_argument("--input_file_name", type=str, default="final_df.parquet") + parser.add_argument("--run_mode", type=str, default="local") + + + # parse args + args = parser.parse_args() + + # return args + return args + + +def createClassModel(algo_name, catg, nums): + numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]) + + categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) + + preprocesser = 
ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)]) + + if algo_name == 'linear_regression': + #--------------------------------------------- + #setup: Update alpha value + #--------------------------------------------- + model = Ridge(alpha=100) #setup + elif algo_name == 'random_forest': + model = RandomForestRegressor() + else: + pass + + ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), ("model", model)]) + + return ModelPipeline + +def main(args): + + # read in data + final_df = pd.read_parquet(os.path.join(args.prep_data,args.input_file_name)) + catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"] + num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"] + label = ["totalAmount"] + # make sure categorical columns are strings + final_df[catg_cols] = final_df[catg_cols].astype("str") + + # split data + X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222) + + # test 2 algorithms + os.makedirs(args.model_folder, exist_ok=True) + + algorithmname = "linear_regression" + fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline + fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine + + y_pred = fitPipeline.predict(X_test) # score with fitted pipeline + + # Evaluate + r2 = r2_score(y_test, y_pred) + mape = mean_absolute_percentage_error(y_test, y_pred) + rmse = np.sqrt(mean_squared_error(y_test, y_pred)) + + + joblib.dump(fitPipeline,args.model_folder+"/"+algorithmname+".joblib") + + print("Training finished!. Metrics:") + print(f"R2_{algorithmname}", r2) + print(f"MAPE_{algorithmname}", mape) + print(f"RMSE_{algorithmname}", rmse) + print("Model",args.model_folder+"/"+algorithmname+".joblib","saved!") + + if args.run_mode == 'remote': + mlflow.log_metric(f"R2_{algorithmname}", r2) + mlflow.log_metric(f"MAPE_{algorithmname}", mape) + mlflow.log_metric(f"RMSE_{algorithmname}", rmse) + mlflow.sklearn.log_model(fitPipeline,f"{algorithmname}_model") + +# run script +if __name__ == "__main__": + # parse args + args = parse_args() + # run main function + main(args) \ No newline at end of file diff --git a/src/workshop/core/training/ml_training.yml b/src/workshop/core/training/ml_training.yml index 74685a13..7b5e8d71 100644 --- a/src/workshop/core/training/ml_training.yml +++ b/src/workshop/core/training/ml_training.yml @@ -1,26 +1,26 @@ -$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json -code: ./ -command: >- - python ml_training.py - --prep_data ${{inputs.prep_data}} - --model_folder ${{outputs.model_folder}} - --run_mode ${{inputs.run_mode}} - -inputs: - prep_data: - type: uri_folder - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ - run_mode: "remote" - -outputs: - model_folder: - type: uri_folder - path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ - -environment: - conda_file: ./conda_ml_training.yml - image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest -compute: azureml:cpu-cluster -display_name: ml-training -experiment_name: ml-training -description: ml-training +$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json +code: ./ +command: >- + python ml_training.py + --prep_data ${{inputs.prep_data}} + --model_folder ${{outputs.model_folder}} + --run_mode ${{inputs.run_mode}} + +inputs: + 
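createClassModel above composes imputation, one-hot encoding and a Ridge (or random forest) regressor into a single sklearn Pipeline. A toy, self-contained version of that pattern on invented data shows what the ColumnTransformer feeds into the model:

# Column names and values are made up purely for illustration.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({
    "vendorID": ["1", "2", "1", "2"],
    "tripDistance": [1.2, 3.4, None, 7.8],
    "totalAmount": [7.5, 14.0, 6.0, 25.5],
})
catg, nums, label = ["vendorID"], ["tripDistance"], "totalAmount"

preprocess = ColumnTransformer([
    ("num", SimpleImputer(strategy="constant", fill_value=0), nums),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]), catg),
])
model = Pipeline([("preprocessor", preprocess), ("model", Ridge(alpha=1000))])
model.fit(df[catg + nums], df[label])
print(model.predict(df[catg + nums]))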
prep_data: + type: uri_folder + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ + run_mode: "remote" + +outputs: + model_folder: + type: uri_folder + path: azureml://datastores/workspaceblobstore/paths/mlops_workshop_data/ + +environment: + conda_file: ./conda_ml_training.yml + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest +compute: azureml:cpu-cluster +display_name: ml-training +experiment_name: ml-training +description: ml-training diff --git a/src/workshop/data/.amlignore b/src/workshop/data/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/data/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/data/.amlignore.amltmp b/src/workshop/data/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/data/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/data/create_datasets.py b/src/workshop/data/create_datasets.py index 47869577..6cafc3ef 100644 --- a/src/workshop/data/create_datasets.py +++ b/src/workshop/data/create_datasets.py @@ -1,169 +1,169 @@ -from azureml.opendatasets import NycTlcGreen -from azureml.opendatasets import PublicHolidays -from azureml.opendatasets import NoaaIsdWeather -import os -import pandas as pd -from datetime import datetime -from dateutil.relativedelta import relativedelta -import argparse -from azureml.core import Run, Dataset,Datastore, Workspace -import shutil -from sklearn.model_selection import train_test_split -import numpy as np -def parse_args(): - # setup arg parser - parser = argparse.ArgumentParser() - - # add arguments - parser.add_argument("--year", type=str,default=2016) - parser.add_argument("--sample_size", type=str, default=2000) - parser.add_argument("--nyc_dataset_name", type=str, default="NycTlcGreen") - parser.add_argument("--public_holiday_dataset_name", type=str, default="PublicHoliday") - parser.add_argument("--weather_dataset_name", type=str, default="Weather") - - parser.add_argument("--datastore_name", type=str, default="workspaceblobstore") - parser.add_argument("--ml_workspace_name", type=str, default=None) - parser.add_argument("--sub_id", type=str, default=None) - parser.add_argument("--resourcegroup_name", type=str, default=None) - - # parse args - args = parser.parse_args() - - # return args - return args - -def build_time_features(vector): - pickup_datetime = vector[0] - month_num = pickup_datetime.month - day_of_month = pickup_datetime.day - day_of_week = pickup_datetime.weekday() - hour_of_day = pickup_datetime.hour - country_code = "US" - hr_sin = np.sin(hour_of_day*(2.*np.pi/24)) - hr_cos = np.cos(hour_of_day*(2.*np.pi/24)) - dy_sin = np.sin(day_of_week*(2.*np.pi/7)) - dy_cos = np.cos(day_of_week*(2.*np.pi/7)) - - return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos)) 
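build_time_features above (and its twin in data_engineering.py) places hour-of-day and day-of-week on the unit circle so that, for example, 23:00 and 00:00 end up close together. A small standalone check of that sin/cos encoding:

# Same formulas as build_time_features: hours on a 24-step circle, weekdays on 7.
import numpy as np

for hour in (0, 23):
    hr_sin = np.sin(hour * (2.0 * np.pi / 24))
    hr_cos = np.cos(hour * (2.0 * np.pi / 24))
    print(hour, round(hr_sin, 3), round(hr_cos, 3))
# hours 0 and 23 map to neighbouring points, unlike the raw values 0 and 23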
- -def create_ml_dataset(green_taxi_df,holidays_df,weather_df): - - green_taxi_df[["month_num", "day_of_month","day_of_week", "hour_of_day", "country_code", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]] = \ - green_taxi_df[["lpepPickupDatetime"]].apply(build_time_features, axis=1) - - columns_to_remove = ["lpepDropoffDatetime", "puLocationId", "doLocationId", "extra", "mtaTax", - "improvementSurcharge", "tollsAmount", "ehailFee", "tripType", "rateCodeID", - "storeAndFwdFlag", "paymentType", "fareAmount", "tipAmount"] - - green_taxi_df.drop(columns_to_remove, axis=1, inplace=True) - - - green_taxi_df["datetime"] = green_taxi_df["lpepPickupDatetime"].dt.normalize() - - - - holidays_df = holidays_df.rename(columns={"countryRegionCode": "country_code"}) - holidays_df["datetime"] = holidays_df["date"].dt.normalize() - - holidays_df.drop(["countryOrRegion", "holidayName", "date"], axis=1, inplace=True) - - taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how="left", on=["datetime", "country_code"]) - taxi_holidays_df[taxi_holidays_df["normalizeHolidayName"].notnull()] - - - weather_df["datetime"] = weather_df["datetime"].dt.normalize() - - # group by datetime - aggregations = {"precipTime": "max", "temperature": "mean", "precipDepth": "max"} - weather_df_grouped = weather_df.groupby("datetime").agg(aggregations) - weather_df_grouped.head(10) - - taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how="left", on=["datetime"]) - # taxi_holidays_weather_df.describe() - - final_df = taxi_holidays_weather_df.query("pickupLatitude>=40.53 and pickupLatitude<=40.88 and \ - pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \ - tripDistance>0 and tripDistance<75 and \ - passengerCount>0 and passengerCount<100 and \ - totalAmount>0") - return final_df - -def main(args): - # check aml workspace - # read in data - os.makedirs("data/.tmp", exist_ok=True) - if (args.ml_workspace_name == None): - print("Please provide your AML Workspace Name") - print("Example:") - print("create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name amlwrkshp-000 --sub_id SUBSCRIPTIONID --resourcegroup_name amlwrkshp-000-rg") - return 0 - elif (args.sub_id == None): - print("lease provide your Subscription ID") - print("Example:") - print("create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name amlwrkshp-000 --sub_id SUBSCRIPTIONID --resourcegroup_name amlwrkshp-000-rg") - return 0 - if (args.resourcegroup_name == None): - print("lease provide your Resource Group Name") - print("Example:") - print("create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name amlwrkshp-000 --sub_id SUBSCRIPTIONID --resourcegroup_name amlwrkshp-000-rg") - return 0 - else: - amlName, subId, rgName = args.ml_workspace_name, args.sub_id, args.resourcegroup_name - print("Accessing your AML workspace {0} in {1}".format(amlName, rgName)) - - ws = Workspace.get(name=amlName, subscription_id=subId, resource_group=rgName) - - print(ws) - datastore= ws.datastores[args.datastore_name] - start = datetime.strptime(f"1/1/{args.year}","%m/%d/%Y") - end = datetime.strptime(f"1/31/{args.year}","%m/%d/%Y") - - green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \ - .to_pandas_dataframe().sample(args.sample_size) for x in range(12)]) - green_taxi_df.to_parquet("data/.tmp/green_taxi.parquet") - green_taxi_df_ut= green_taxi_df.sample(1000) #small file for local testing - green_taxi_df_ut.to_parquet("data/green_taxi.parquet") - - - - # Public 
holidays - holidays_df = PublicHolidays().to_pandas_dataframe() - holidays_df.to_parquet("data/.tmp/holidays.parquet") - #local dataset - holidays_df_ut = holidays_df.sample(1000) - holidays_df_ut.to_parquet("data/holidays.parquet") - - - weather_df = pd.concat([NoaaIsdWeather(cols=["temperature", "precipTime", "precipDepth"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\ - .to_pandas_dataframe().query("latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature") for x in range(12)]) - weather_df.to_parquet("data/.tmp/weather.parquet") - - weather_df_ut = weather_df.sample(1000)#small file for local testing - weather_df_ut.to_parquet("data/weather.parquet") - - final_df = create_ml_dataset(green_taxi_df,holidays_df,weather_df) - final_df_ut = final_df.sample(1000) - final_df_ut, test_df_ut = train_test_split(final_df_ut, test_size=0.2, random_state=100) - final_df, test_df = train_test_split(final_df, test_size=0.2, random_state=100) - - final_df_ut.to_parquet("data/final_df.parquet") - test_df_ut.to_parquet("data/test_df.parquet") - - final_df.to_parquet("data/.tmp/final_df.parquet") - test_df.to_parquet("data/.tmp/test_df.parquet") - shutil.copy('data/linear_regression.joblib','data/.tmp/linear_regression.joblib') - - #also uploading to cloud for remote job run - datastore.upload(src_dir='data/.tmp', - target_path='mlops_workshop_data', - overwrite=True) - shutil.rmtree('data/.tmp') - -# run script -if __name__ == "__main__": - # parse args - print("Running script to create datasets") - args = parse_args() - - # run main function +from azureml.opendatasets import NycTlcGreen +from azureml.opendatasets import PublicHolidays +from azureml.opendatasets import NoaaIsdWeather +import os +import pandas as pd +from datetime import datetime +from dateutil.relativedelta import relativedelta +import argparse +from azureml.core import Run, Dataset,Datastore, Workspace +import shutil +from sklearn.model_selection import train_test_split +import numpy as np +def parse_args(): + # setup arg parser + parser = argparse.ArgumentParser() + + # add arguments + parser.add_argument("--year", type=str,default=2016) + parser.add_argument("--sample_size", type=str, default=2000) + parser.add_argument("--nyc_dataset_name", type=str, default="NycTlcGreen") + parser.add_argument("--public_holiday_dataset_name", type=str, default="PublicHoliday") + parser.add_argument("--weather_dataset_name", type=str, default="Weather") + + parser.add_argument("--datastore_name", type=str, default="workspaceblobstore") + parser.add_argument("--ml_workspace_name", type=str, default=None) + parser.add_argument("--sub_id", type=str, default=None) + parser.add_argument("--resourcegroup_name", type=str, default=None) + + # parse args + args = parser.parse_args() + + # return args + return args + +def build_time_features(vector): + pickup_datetime = vector[0] + month_num = pickup_datetime.month + day_of_month = pickup_datetime.day + day_of_week = pickup_datetime.weekday() + hour_of_day = pickup_datetime.hour + country_code = "US" + hr_sin = np.sin(hour_of_day*(2.*np.pi/24)) + hr_cos = np.cos(hour_of_day*(2.*np.pi/24)) + dy_sin = np.sin(day_of_week*(2.*np.pi/7)) + dy_cos = np.cos(day_of_week*(2.*np.pi/7)) + + return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos)) + +def create_ml_dataset(green_taxi_df,holidays_df,weather_df): + + green_taxi_df[["month_num", "day_of_month","day_of_week", 
"hour_of_day", "country_code", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]] = \ + green_taxi_df[["lpepPickupDatetime"]].apply(build_time_features, axis=1) + + columns_to_remove = ["lpepDropoffDatetime", "puLocationId", "doLocationId", "extra", "mtaTax", + "improvementSurcharge", "tollsAmount", "ehailFee", "tripType", "rateCodeID", + "storeAndFwdFlag", "paymentType", "fareAmount", "tipAmount"] + + green_taxi_df.drop(columns_to_remove, axis=1, inplace=True) + + + green_taxi_df["datetime"] = green_taxi_df["lpepPickupDatetime"].dt.normalize() + + + + holidays_df = holidays_df.rename(columns={"countryRegionCode": "country_code"}) + holidays_df["datetime"] = holidays_df["date"].dt.normalize() + + holidays_df.drop(["countryOrRegion", "holidayName", "date"], axis=1, inplace=True) + + taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how="left", on=["datetime", "country_code"]) + taxi_holidays_df[taxi_holidays_df["normalizeHolidayName"].notnull()] + + + weather_df["datetime"] = weather_df["datetime"].dt.normalize() + + # group by datetime + aggregations = {"precipTime": "max", "temperature": "mean", "precipDepth": "max"} + weather_df_grouped = weather_df.groupby("datetime").agg(aggregations) + weather_df_grouped.head(10) + + taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how="left", on=["datetime"]) + # taxi_holidays_weather_df.describe() + + final_df = taxi_holidays_weather_df.query("pickupLatitude>=40.53 and pickupLatitude<=40.88 and \ + pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \ + tripDistance>0 and tripDistance<75 and \ + passengerCount>0 and passengerCount<100 and \ + totalAmount>0") + return final_df + +def main(args): + # check aml workspace + # read in data + os.makedirs("data/.tmp", exist_ok=True) + if (args.ml_workspace_name == None): + print("Please provide your AML Workspace Name") + print("Example:") + print("create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name amlwrkshp-000 --sub_id SUBSCRIPTIONID --resourcegroup_name amlwrkshp-000-rg") + return 0 + elif (args.sub_id == None): + print("lease provide your Subscription ID") + print("Example:") + print("create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name amlwrkshp-000 --sub_id SUBSCRIPTIONID --resourcegroup_name amlwrkshp-000-rg") + return 0 + if (args.resourcegroup_name == None): + print("lease provide your Resource Group Name") + print("Example:") + print("create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name amlwrkshp-000 --sub_id SUBSCRIPTIONID --resourcegroup_name amlwrkshp-000-rg") + return 0 + else: + amlName, subId, rgName = args.ml_workspace_name, args.sub_id, args.resourcegroup_name + print("Accessing your AML workspace {0} in {1}".format(amlName, rgName)) + + ws = Workspace.get(name=amlName, subscription_id=subId, resource_group=rgName) + + print(ws) + datastore= ws.datastores[args.datastore_name] + start = datetime.strptime(f"1/1/{args.year}","%m/%d/%Y") + end = datetime.strptime(f"1/31/{args.year}","%m/%d/%Y") + + green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \ + .to_pandas_dataframe().sample(args.sample_size) for x in range(12)]) + green_taxi_df.to_parquet("data/.tmp/green_taxi.parquet") + green_taxi_df_ut= green_taxi_df.sample(1000) #small file for local testing + green_taxi_df_ut.to_parquet("data/green_taxi.parquet") + + + + # Public holidays + holidays_df = PublicHolidays().to_pandas_dataframe() + holidays_df.to_parquet("data/.tmp/holidays.parquet") + 
#local dataset + holidays_df_ut = holidays_df.sample(1000) + holidays_df_ut.to_parquet("data/holidays.parquet") + + + weather_df = pd.concat([NoaaIsdWeather(cols=["temperature", "precipTime", "precipDepth"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\ + .to_pandas_dataframe().query("latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature") for x in range(12)]) + weather_df.to_parquet("data/.tmp/weather.parquet") + + weather_df_ut = weather_df.sample(1000)#small file for local testing + weather_df_ut.to_parquet("data/weather.parquet") + + final_df = create_ml_dataset(green_taxi_df,holidays_df,weather_df) + final_df_ut = final_df.sample(1000) + final_df_ut, test_df_ut = train_test_split(final_df_ut, test_size=0.2, random_state=100) + final_df, test_df = train_test_split(final_df, test_size=0.2, random_state=100) + + final_df_ut.to_parquet("data/final_df.parquet") + test_df_ut.to_parquet("data/test_df.parquet") + + final_df.to_parquet("data/.tmp/final_df.parquet") + test_df.to_parquet("data/.tmp/test_df.parquet") + shutil.copy('data/linear_regression.joblib','data/.tmp/linear_regression.joblib') + + #also uploading to cloud for remote job run + datastore.upload(src_dir='data/.tmp', + target_path='mlops_workshop_data', + overwrite=True) + shutil.rmtree('data/.tmp') + +# run script +if __name__ == "__main__": + # parse args + print("Running script to create datasets") + args = parse_args() + + # run main function main(args) \ No newline at end of file diff --git a/src/workshop/data/linear_regression.joblib b/src/workshop/data/linear_regression.joblib index 52e4e858..5a071e13 100644 Binary files a/src/workshop/data/linear_regression.joblib and b/src/workshop/data/linear_regression.joblib differ diff --git a/src/workshop/documents/.amlignore b/src/workshop/documents/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/documents/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/documents/.amlignore.amltmp b/src/workshop/documents/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/documents/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/documents/IaC/.amlignore b/src/workshop/documents/IaC/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/documents/IaC/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. 
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/documents/IaC/.amlignore.amltmp b/src/workshop/documents/IaC/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/documents/IaC/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/documents/IaC/createSP.azcli b/src/workshop/documents/IaC/createSP.azcli index 36d2d5af..153fbbbf 100644 --- a/src/workshop/documents/IaC/createSP.azcli +++ b/src/workshop/documents/IaC/createSP.azcli @@ -1,105 +1,105 @@ -#!/bin/bash - -echo "This script will help you to create Azure Resources for EZ-MLOps workshop." -echo "For your information following Azure resources will be create in a Resource Group level" -echo "" -echo "* Service Principal" -echo "" - -read -p "0. Please hit Enter to run the script >>" - -# Select Azure subscription -az account list --output table - -echo "" - -read -p "1. TYPE your subscription Name for this workshop case-sensitive>>" subName - -if [ ! -z "$subName" ]; then - echo "You select " $subName " for the workshop." - az account set --subscription "$subName" --verbose - subscriptionID=$(az account show --query id -o tsv) - echo $subscriptionID -else - echo "Please run the script again!! EXIT" - exit -fi - -chkName=$(az account list --output tsv --query "[?isDefault].name") - -if [ "$subName" = "$chkName" ]; then - echo "" - echo "Subscripion Name has confirmed" - echo "" -else - echo "Please try again with correct subscription name" - echo "EXIT" - exit -fi - -echo "" - -az account list-locations --output table --query []['name','displayName'] -echo "" -echo "2. Type location for the lab" -read -p "Location >>" loc - -# set azure region -if [ ! -z "$loc" ]; then - echo "You set location" $loc " for the lab." -else - echo "Default location is East US 2" - loc=eastus2 -fi - -echo "" -echo "3. What is your Resource Group Name" -read -p "Resource Group Name >>" rgName - -# set azure region -if [ ! -z "$rgName" ]; then - echo "You set Resource Group Name" $rgName " for the lab." 
-else - echo "Please try again with correct Resource Group name" - echo "EXIT" - exit -fi - -chkrgName=$(az group list --output tsv --query "[?name=='$rgName)'].name" ) - -if [ "$rgName" = "$chkrgName" ]; then - echo "" - echo "Resource Group Name has confirmed" - echo "" -else - echo "Please try again with correct subscription name" - echo "EXIT" - exit -fi - - -# Create Service Principal -# https://docs.microsoft.com/en-us/cli/azure/create-an-azure-service-principal-azure-cli - -let "randomIdentifier=$RANDOM*$RANDOM" -servicePrincipalName="ezmlops-$randomIdentifier" -roleName="contributor" -# Verify the ID of the active subscription -echo "Using subscription ID $subscriptionID" -echo "" -echo "Creating SP for RBAC with name $servicePrincipalName," -echo "" -echo "with role $roleName" -echo "" -echo "and in scopes /subscriptions/$subscriptionID/resourceGroups/$rgName" -echo "" -echo "If you fail this step, you cannot move on to the next step" -echo "" -az ad sp create-for-rbac --name $servicePrincipalName --role $roleName --scopes /subscriptions/$subscriptionID/resourceGroups/$rgName > sp.txt -echo "" -echo "*************************************" -echo "Information about the Service Principal is captured in the file ./sp.txt" -echo "*************************************" -echo "" -cat ./sp.txt -echo "" +#!/bin/bash + +echo "This script will help you to create Azure Resources for EZ-MLOps workshop." +echo "For your information following Azure resources will be create in a Resource Group level" +echo "" +echo "* Service Principal" +echo "" + +read -p "0. Please hit Enter to run the script >>" + +# Select Azure subscription +az account list --output table + +echo "" + +read -p "1. TYPE your subscription Name for this workshop case-sensitive>>" subName + +if [ ! -z "$subName" ]; then + echo "You select " $subName " for the workshop." + az account set --subscription "$subName" --verbose + subscriptionID=$(az account show --query id -o tsv) + echo $subscriptionID +else + echo "Please run the script again!! EXIT" + exit +fi + +chkName=$(az account list --output tsv --query "[?isDefault].name") + +if [ "$subName" = "$chkName" ]; then + echo "" + echo "Subscripion Name has confirmed" + echo "" +else + echo "Please try again with correct subscription name" + echo "EXIT" + exit +fi + +echo "" + +az account list-locations --output table --query []['name','displayName'] +echo "" +echo "2. Type location for the lab" +read -p "Location >>" loc + +# set azure region +if [ ! -z "$loc" ]; then + echo "You set location" $loc " for the lab." +else + echo "Default location is East US 2" + loc=eastus2 +fi + +echo "" +echo "3. What is your Resource Group Name" +read -p "Resource Group Name >>" rgName + +# set azure region +if [ ! -z "$rgName" ]; then + echo "You set Resource Group Name" $rgName " for the lab." 
+else + echo "Please try again with correct Resource Group name" + echo "EXIT" + exit +fi + +chkrgName=$(az group list --output tsv --query "[?name=='$rgName)'].name" ) + +if [ "$rgName" = "$chkrgName" ]; then + echo "" + echo "Resource Group Name has confirmed" + echo "" +else + echo "Please try again with correct subscription name" + echo "EXIT" + exit +fi + + +# Create Service Principal +# https://docs.microsoft.com/en-us/cli/azure/create-an-azure-service-principal-azure-cli + +let "randomIdentifier=$RANDOM*$RANDOM" +servicePrincipalName="ezmlops-$randomIdentifier" +roleName="contributor" +# Verify the ID of the active subscription +echo "Using subscription ID $subscriptionID" +echo "" +echo "Creating SP for RBAC with name $servicePrincipalName," +echo "" +echo "with role $roleName" +echo "" +echo "and in scopes /subscriptions/$subscriptionID/resourceGroups/$rgName" +echo "" +echo "If you fail this step, you cannot move on to the next step" +echo "" +az ad sp create-for-rbac --name $servicePrincipalName --role $roleName --scopes /subscriptions/$subscriptionID/resourceGroups/$rgName > sp.txt +echo "" +echo "*************************************" +echo "Information about the Service Principal is captured in the file ./sp.txt" +echo "*************************************" +echo "" +cat ./sp.txt +echo "" diff --git a/src/workshop/documents/IaC/iac_EZ_MLOps.json b/src/workshop/documents/IaC/iac_EZ_MLOps.json index e0af8333..15c6d916 100644 --- a/src/workshop/documents/IaC/iac_EZ_MLOps.json +++ b/src/workshop/documents/IaC/iac_EZ_MLOps.json @@ -1,177 +1,177 @@ -{ - "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "name": { - "type": "string", - "minLength": 5, - "maxLength": 8, - "metadata": { - "description": "Specifies the name of the deployment." - } - }, - "vmSize": { - "type": "string", - "allowedValues": [ - "Standard_DS2_v2", - "Standard_DS3", - "Standard_DS3_v2", - "Standard_DS4", - "Standard_DS4_v2" - ], - "defaultValue": "Standard_DS3_v2", - "metadata": { - "description": "Choose VM size for computes" - } - }, - "location": { - "type": "string", - "allowedValues": [ - "centralus", - "eastus", - "eastus2", - "southcentralus", - "westcentralus", - "westus" - ], - "defaultValue": "eastus2", - "metadata": { - "description": "Specifies the location of the Azure Machine Learning workspace and dependent resources." 
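`createSP.azcli` above captures the new service principal in `sp.txt`. Because the script calls `az ad sp create-for-rbac` without `--sdk-auth`, the JSON should use the CLI's default `appId`/`password`/`tenant` keys; if yours was created differently, adjust the key names. An optional smoke test of those credentials, assuming the `azure-identity` package is available:

```python
# Optional smoke test for the service principal captured in sp.txt.
# Assumes the default `az ad sp create-for-rbac` output keys (appId, password, tenant).
import json
from azure.identity import ClientSecretCredential

with open("sp.txt") as f:
    sp = json.load(f)

credential = ClientSecretCredential(tenant_id=sp["tenant"],
                                    client_id=sp["appId"],
                                    client_secret=sp["password"])
# Acquiring an ARM token proves the SP can authenticate against Azure.
token = credential.get_token("https://management.azure.com/.default")
print("Token acquired; expires at", token.expires_on)
```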
- } - } - }, - "variables": { - "tenantId": "[subscription().tenantId]", - "storageAccountName": "[concat(parameters('name'),'store')]", - "keyVaultName": "[concat(parameters('name'),'akv')]", - "applicationInsightsName": "[concat(parameters('name'),'appi')]", - "containerRegistryName": "[concat(parameters('name'),'acr')]", - "workspaceName": "[concat(parameters('name'),'aml')]", - "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]", - "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyVaultName'))]", - "applicationInsights": "[resourceId('Microsoft.Insights/components', variables('applicationInsightsName'))]", - "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries', variables('containerRegistryName'))]", - "amlciName": "[concat(parameters('name'), 'i', substring(uniqueString(resourceGroup().id),1,3))]", - "amlccName": "[concat(parameters('name'), 'c', substring(uniqueString(resourceGroup().id),1,3))]" - }, - "resources": [ - { - "type": "Microsoft.Storage/storageAccounts", - "apiVersion": "2021-01-01", - "name": "[variables('storageAccountName')]", - "location": "[parameters('location')]", - "sku": { - "name": "Standard_LRS" - }, - "kind": "StorageV2", - "properties": { - "encryption": { - "services": { - "blob": { - "enabled": true - }, - "file": { - "enabled": true - } - }, - "keySource": "Microsoft.Storage" - }, - "supportsHttpsTrafficOnly": true - } - }, - { - "type": "Microsoft.KeyVault/vaults", - "apiVersion": "2021-04-01-preview", - "name": "[variables('keyVaultName')]", - "location": "[parameters('location')]", - "properties": { - "tenantId": "[variables('tenantId')]", - "sku": { - "name": "standard", - "family": "A" - }, - "accessPolicies": [], - "enableSoftDelete": true - } - }, - { - "type": "Microsoft.Insights/components", - "apiVersion": "2020-02-02", - "name": "[variables('applicationInsightsName')]", - "location": "[if(or(equals(parameters('location'),'eastus2'), equals(parameters('location'),'westcentralus')),'southcentralus',parameters('location'))]", - "kind": "web", - "properties": { - "Application_Type": "web" - } - }, - { - "type": "Microsoft.ContainerRegistry/registries", - "sku": { - "name": "Standard", - "tier": "Standard" - }, - "name": "[variables('containerRegistryName')]", - "apiVersion": "2019-12-01-preview", - "location": "[parameters('location')]", - "properties": { - "adminUserEnabled": true - } - }, - { - "type": "Microsoft.MachineLearningServices/workspaces", - "apiVersion": "2020-03-01", - "identity": { - "type": "systemAssigned" - }, - "name": "[variables('workspaceName')]", - "location": "[parameters('location')]", - "dependsOn": [ - "[variables('storageAccount')]", - "[variables('keyVault')]", - "[variables('applicationInsights')]", - "[variables('containerRegistry')]" - ], - "properties": { - "friendlyName": "[variables('workspaceName')]", - "storageAccount": "[variables('storageAccount')]", - "keyVault": "[variables('keyVault')]", - "applicationInsights": "[variables('applicationInsights')]", - "containerRegistry": "[variables('containerRegistry')]" - } - }, - { - "type": "Microsoft.MachineLearningServices/workspaces/computes", - "name": "[concat(variables('workspaceName'), '/', variables('amlciName'))]", - "apiVersion": "2021-07-01", - "location": "[parameters('location')]", - "dependsOn": [ - "[resourceId('Microsoft.MachineLearningServices/workspaces', variables('workspaceName'))]" - ], - "properties": { - "computeType": "ComputeInstance", - "properties": { - 
"vmSize": "[parameters('vmSize')]" - } - } - }, - { - "type": "Microsoft.MachineLearningServices/workspaces/computes", - "name": "[concat(variables('workspaceName'), '/', variables('amlccName'))]", - "apiVersion": "2021-01-01", - "location": "[parameters('location')]", - "dependsOn": [ - "[resourceId('Microsoft.MachineLearningServices/workspaces', variables('workspaceName'))]" - ], - "properties": { - "computeType": "AmlCompute", - "properties": { - "vmSize": "[parameters('vmSize')]", - "scaleSettings": { - "minNodeCount": "0", - "maxNodeCount": "1" - } - } - } - } - ] -} +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "name": { + "type": "string", + "minLength": 5, + "maxLength": 8, + "metadata": { + "description": "Specifies the name of the deployment." + } + }, + "vmSize": { + "type": "string", + "allowedValues": [ + "Standard_DS2_v2", + "Standard_DS3", + "Standard_DS3_v2", + "Standard_DS4", + "Standard_DS4_v2" + ], + "defaultValue": "Standard_DS3_v2", + "metadata": { + "description": "Choose VM size for computes" + } + }, + "location": { + "type": "string", + "allowedValues": [ + "centralus", + "eastus", + "eastus2", + "southcentralus", + "westcentralus", + "westus" + ], + "defaultValue": "eastus2", + "metadata": { + "description": "Specifies the location of the Azure Machine Learning workspace and dependent resources." + } + } + }, + "variables": { + "tenantId": "[subscription().tenantId]", + "storageAccountName": "[concat(parameters('name'),'store')]", + "keyVaultName": "[concat(parameters('name'),'akv')]", + "applicationInsightsName": "[concat(parameters('name'),'appi')]", + "containerRegistryName": "[concat(parameters('name'),'acr')]", + "workspaceName": "[concat(parameters('name'),'aml')]", + "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]", + "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyVaultName'))]", + "applicationInsights": "[resourceId('Microsoft.Insights/components', variables('applicationInsightsName'))]", + "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries', variables('containerRegistryName'))]", + "amlciName": "[concat(parameters('name'), 'i', substring(uniqueString(resourceGroup().id),1,3))]", + "amlccName": "[concat(parameters('name'), 'c', substring(uniqueString(resourceGroup().id),1,3))]" + }, + "resources": [ + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-01-01", + "name": "[variables('storageAccountName')]", + "location": "[parameters('location')]", + "sku": { + "name": "Standard_LRS" + }, + "kind": "StorageV2", + "properties": { + "encryption": { + "services": { + "blob": { + "enabled": true + }, + "file": { + "enabled": true + } + }, + "keySource": "Microsoft.Storage" + }, + "supportsHttpsTrafficOnly": true + } + }, + { + "type": "Microsoft.KeyVault/vaults", + "apiVersion": "2021-04-01-preview", + "name": "[variables('keyVaultName')]", + "location": "[parameters('location')]", + "properties": { + "tenantId": "[variables('tenantId')]", + "sku": { + "name": "standard", + "family": "A" + }, + "accessPolicies": [], + "enableSoftDelete": true + } + }, + { + "type": "Microsoft.Insights/components", + "apiVersion": "2020-02-02", + "name": "[variables('applicationInsightsName')]", + "location": "[if(or(equals(parameters('location'),'eastus2'), equals(parameters('location'),'westcentralus')),'southcentralus',parameters('location'))]", + "kind": 
"web", + "properties": { + "Application_Type": "web" + } + }, + { + "type": "Microsoft.ContainerRegistry/registries", + "sku": { + "name": "Standard", + "tier": "Standard" + }, + "name": "[variables('containerRegistryName')]", + "apiVersion": "2019-12-01-preview", + "location": "[parameters('location')]", + "properties": { + "adminUserEnabled": true + } + }, + { + "type": "Microsoft.MachineLearningServices/workspaces", + "apiVersion": "2020-03-01", + "identity": { + "type": "systemAssigned" + }, + "name": "[variables('workspaceName')]", + "location": "[parameters('location')]", + "dependsOn": [ + "[variables('storageAccount')]", + "[variables('keyVault')]", + "[variables('applicationInsights')]", + "[variables('containerRegistry')]" + ], + "properties": { + "friendlyName": "[variables('workspaceName')]", + "storageAccount": "[variables('storageAccount')]", + "keyVault": "[variables('keyVault')]", + "applicationInsights": "[variables('applicationInsights')]", + "containerRegistry": "[variables('containerRegistry')]" + } + }, + { + "type": "Microsoft.MachineLearningServices/workspaces/computes", + "name": "[concat(variables('workspaceName'), '/', variables('amlciName'))]", + "apiVersion": "2021-07-01", + "location": "[parameters('location')]", + "dependsOn": [ + "[resourceId('Microsoft.MachineLearningServices/workspaces', variables('workspaceName'))]" + ], + "properties": { + "computeType": "ComputeInstance", + "properties": { + "vmSize": "[parameters('vmSize')]" + } + } + }, + { + "type": "Microsoft.MachineLearningServices/workspaces/computes", + "name": "[concat(variables('workspaceName'), '/', variables('amlccName'))]", + "apiVersion": "2021-01-01", + "location": "[parameters('location')]", + "dependsOn": [ + "[resourceId('Microsoft.MachineLearningServices/workspaces', variables('workspaceName'))]" + ], + "properties": { + "computeType": "AmlCompute", + "properties": { + "vmSize": "[parameters('vmSize')]", + "scaleSettings": { + "minNodeCount": "0", + "maxNodeCount": "1" + } + } + } + } + ] +} diff --git a/src/workshop/documents/IaC/iac_cc.yml b/src/workshop/documents/IaC/iac_cc.yml index 206c231c..ad3b6a83 100644 --- a/src/workshop/documents/IaC/iac_cc.yml +++ b/src/workshop/documents/IaC/iac_cc.yml @@ -1,7 +1,7 @@ -$schema: https://azuremlschemas.azureedge.net/latest/amlCompute.schema.json -name: amlcc -type: amlcompute -size: STANDARD_DS3_v2 -min_instances: 0 -max_instances: 2 +$schema: https://azuremlschemas.azureedge.net/latest/amlCompute.schema.json +name: amlcc +type: amlcompute +size: STANDARD_DS3_v2 +min_instances: 0 +max_instances: 2 idle_time_before_scale_down: 120 \ No newline at end of file diff --git a/src/workshop/documents/IaC/iac_ci.yml b/src/workshop/documents/IaC/iac_ci.yml index 4687d04d..fbe062bc 100644 --- a/src/workshop/documents/IaC/iac_ci.yml +++ b/src/workshop/documents/IaC/iac_ci.yml @@ -1,4 +1,4 @@ -$schema: https://azuremlschemas.azureedge.net/latest/computeInstance.schema.json -name: amlci -type: computeinstance +$schema: https://azuremlschemas.azureedge.net/latest/computeInstance.schema.json +name: amlci +type: computeinstance size: STANDARD_DS3_v2 \ No newline at end of file diff --git a/src/workshop/documents/IaC/iac_mlopsworkshop.azcli b/src/workshop/documents/IaC/iac_mlopsworkshop.azcli index edb76b09..d479f88f 100644 --- a/src/workshop/documents/IaC/iac_mlopsworkshop.azcli +++ b/src/workshop/documents/IaC/iac_mlopsworkshop.azcli @@ -1,103 +1,103 @@ -#!/bin/bash - -echo "This script will help you to create Azure Resources for MLOps workshop." 
-echo "For your information following Azure resources will be create in a Resource Group:" -echo "" -echo "* Azure Resource Group" -echo "* Azure Machine Learning Service" -echo " - Blob Storage Account" -echo " - Azure Key Vault" -echo " - Azure Container Registry" -echo " - Application Insight" -echo " - 1 Compute Instance" -echo " - 1 Compute Cluster" -echo "* Service Principal" -echo "" - -read -p "0. Please hit Enter to run the script >>" - -# Select Azure subscription -az account list --output table - -echo "" - -read -p "1. TYPE your subscription Name for this workshop case-sensitive>>" subName - -if [ ! -z "$subName" ]; then - echo "You select " $subName " for the workshop." - az account set --subscription "$subName" --verbose - subscriptionID=$(az account show --query id -o tsv) - echo $subscriptionID -else - echo "Please run the script again!! EXIT" - exit -fi - -chkName=$(az account list --output tsv --query "[?isDefault].name") - -if [ "$subName" = "$chkName" ]; then - echo "Subscripion Name has confirmed" -else - echo "Please try again with correct subscription name" - exit -fi - -echo "" - -az account list-locations --output table --query []['name','displayName'] -echo "" -echo "2. Type location for the lab" -read -p "Location >>" loc - -# set azure region -if [ ! -z "$loc" ]; then - echo "You set location" $loc " for the lab." -else - echo "Default location is West US 2" - loc=westus2 -fi - -# if you have exsiting one please use the one -num=$(shuf -i0-1000 -n1) -rgName=amlwrkshp-$num-rg #Save it as ps1 -amlName=amlwrkshp-$num -ciName=amlci$num -echo $rgName - -echo "Creating Resource Group" -# Create Resource Group -az group create -n $rgName -l $loc --tags 'owner=workshop' 'environment=workshop' 'deleteme=afterworkshop' - -echo "Creating Azure Machine Learning Service" -# Create aml workspace -az ml workspace create -g $rgName -n $amlName - -echo "Creating Compute Instance in your $amlName Azure Machine Learning Workspace" -# Create Compute Instance -az ml compute create --name amlci$num --size STANDARD_DS11_V2 --type ComputeInstance --resource-group $rgName --workspace-name $amlName - -echo "Creating Compute Cluster in your $amlName Azure Machine Learning Workspace" -# Create Comput Cluster -az ml compute create --name amlcc$num --size STANDARD_DS11_V2 --min-instances 0 --max-instances 2 --type AmlCompute --resource-group $rgName --workspace-name $amlName - -# Create Service Principal -# https://docs.microsoft.com/en-us/cli/azure/create-an-azure-service-principal-azure-cli - -let "randomIdentifier=$RANDOM*$RANDOM" -servicePrincipalName="mlops-sp-$randomIdentifier" -roleName="contributor" -# Verify the ID of the active subscription -echo "Using subscription ID $subscriptionID" -echo "" -echo "Creating SP for RBAC with name $servicePrincipalName," -echo "with role $roleName" -echo "and in scopes /subscriptions/$subscriptionID/resourceGroups/$resourceGroup" -echo "" -az ad sp create-for-rbac --name $servicePrincipalName --role $roleName --scopes /subscriptions/$subscriptionID/resourceGroups/$rgName > sp.txt -echo "" -echo "*************************************" -echo "Information about the Service Principal is captured in the file ./sp.txt" -echo "*************************************" -echo "" -cat ./sp.txt -echo "" +#!/bin/bash + +echo "This script will help you to create Azure Resources for MLOps workshop." 
+echo "For your information following Azure resources will be create in a Resource Group:" +echo "" +echo "* Azure Resource Group" +echo "* Azure Machine Learning Service" +echo " - Blob Storage Account" +echo " - Azure Key Vault" +echo " - Azure Container Registry" +echo " - Application Insight" +echo " - 1 Compute Instance" +echo " - 1 Compute Cluster" +echo "* Service Principal" +echo "" + +read -p "0. Please hit Enter to run the script >>" + +# Select Azure subscription +az account list --output table + +echo "" + +read -p "1. TYPE your subscription Name for this workshop case-sensitive>>" subName + +if [ ! -z "$subName" ]; then + echo "You select " $subName " for the workshop." + az account set --subscription "$subName" --verbose + subscriptionID=$(az account show --query id -o tsv) + echo $subscriptionID +else + echo "Please run the script again!! EXIT" + exit +fi + +chkName=$(az account list --output tsv --query "[?isDefault].name") + +if [ "$subName" = "$chkName" ]; then + echo "Subscripion Name has confirmed" +else + echo "Please try again with correct subscription name" + exit +fi + +echo "" + +az account list-locations --output table --query []['name','displayName'] +echo "" +echo "2. Type location for the lab" +read -p "Location >>" loc + +# set azure region +if [ ! -z "$loc" ]; then + echo "You set location" $loc " for the lab." +else + echo "Default location is West US 2" + loc=westus2 +fi + +# if you have exsiting one please use the one +num=$(shuf -i0-1000 -n1) +rgName=amlwrkshp-$num-rg #Save it as ps1 +amlName=amlwrkshp-$num +ciName=amlci$num +echo $rgName + +echo "Creating Resource Group" +# Create Resource Group +az group create -n $rgName -l $loc --tags 'owner=workshop' 'environment=workshop' 'deleteme=afterworkshop' + +echo "Creating Azure Machine Learning Service" +# Create aml workspace +az ml workspace create -g $rgName -n $amlName + +echo "Creating Compute Instance in your $amlName Azure Machine Learning Workspace" +# Create Compute Instance +az ml compute create --name amlci$num --size STANDARD_DS11_V2 --type ComputeInstance --resource-group $rgName --workspace-name $amlName + +echo "Creating Compute Cluster in your $amlName Azure Machine Learning Workspace" +# Create Comput Cluster +az ml compute create --name amlcc$num --size STANDARD_DS11_V2 --min-instances 0 --max-instances 2 --type AmlCompute --resource-group $rgName --workspace-name $amlName + +# Create Service Principal +# https://docs.microsoft.com/en-us/cli/azure/create-an-azure-service-principal-azure-cli + +let "randomIdentifier=$RANDOM*$RANDOM" +servicePrincipalName="mlops-sp-$randomIdentifier" +roleName="contributor" +# Verify the ID of the active subscription +echo "Using subscription ID $subscriptionID" +echo "" +echo "Creating SP for RBAC with name $servicePrincipalName," +echo "with role $roleName" +echo "and in scopes /subscriptions/$subscriptionID/resourceGroups/$resourceGroup" +echo "" +az ad sp create-for-rbac --name $servicePrincipalName --role $roleName --scopes /subscriptions/$subscriptionID/resourceGroups/$rgName > sp.txt +echo "" +echo "*************************************" +echo "Information about the Service Principal is captured in the file ./sp.txt" +echo "*************************************" +echo "" +cat ./sp.txt +echo "" diff --git a/src/workshop/documents/images/deploy-to-azure.svg b/src/workshop/documents/images/deploy-to-azure.svg index 61ec2669..7eeab675 100644 --- a/src/workshop/documents/images/deploy-to-azure.svg +++ b/src/workshop/documents/images/deploy-to-azure.svg @@ 
-1,67 +1,67 @@ - - - - - - image/svg+xml - - - - - - - - - - - + + + + + + image/svg+xml + + + + + + + + + + + diff --git a/src/workshop/documents/part_0.md b/src/workshop/documents/part_0.md index abab84be..05a8daf9 100644 --- a/src/workshop/documents/part_0.md +++ b/src/workshop/documents/part_0.md @@ -1,570 +1,570 @@ -# Part 0: Workshop Environment Setup -> Note: Read the Workshop scenario overview [here](https://github.com/microsoft/MLOpsTemplate/blob/main/src/workshop/README.md#workshop-scenario) - -## Goal -- Setup Azure ML workspace and components -- Setup github account, a personal access token and configure settings -- Setup local python development environment -- Generate and register data for the workshop -- Setup SP (Service Principal) - -## Pre-requisites for part 0 -- An Azure Account and Subscription -- An understanding of: - - Azure Subscriptions and Resource Groups - - Service Principals - - Github mechanics (creating an account, forking a repo, etc.) - -## Steps - -0. [Check list](./part_tips.md) - -1. [Create Azure Machine Learning resources in Azure](#1-Create-resources-in-Azure) - -2. [Setup Github account and settings](#2-Setup-github-account-and-settings) - -3. [Setup your development environment](#3-Choose-your-development-environment) - - Option A: Use CI as your local in AML - - Option B: Use your local machine (PC or MAC) - -4. [Configure secret in your Github account](#4-Configure-secret-in-your-Github-account) - - Create a Personal Accesss Token (PAT) - - Add a Service Principal (SP) to your repo in Github - -> Note: For a detailed video walk-through of the process, click on the video below to walk through Steps 0 to 3. -> [![VideoGuide](./images/video_img.png)](https://youtu.be/k9ExpebwR18) - - -## 1. Create resources in Azure - -> IMPORTANT: You can skip this section if you've been provided a subscription already setup by Microsoft. - ->Note: To create resources you need an `Owner` or `Contributor` role for your subscription or resource group. If you don't have either one of these roles, you cannot create any of the following resources. - -- 1.1 Open the following link in a new tab. - - [![Deploy to Azure](./images/deploy-to-azure.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2FMLOpsTemplate%2Fmain%2Fsrc%2Fworkshop%2Fdocuments%2FIaC%2Fiac_EZ_MLOps.json) - -- 1.2 Fill out the rest and click the `Create` button at the bottom. - - ![](./images/arm000.png) - - > Note: The provisioning will take 4 mins to 5 mins. If you want to see the progress of the provisioning, you can click the 'Notification' icon. When the provisioning is done, leave the tab open (don't close it!). You can leave it and open a new tab in your browser for the next step. - ![](./images/arm001.png) - - > IMPORTANT: If this deployment fails, you cannot do the following steps in the workshop. Please inform your CSA or lab instructor with the error message. - -## 2. Setup Github Account and Settings - -- 2.1 From the new browser tab, go to [Github](https://github.com/) and login to your account. - > Note: If you don't have an account for Github, please sign up. The workshop can't be done without a Github account. - -- 2.2 After the login, go to [https://github.com/microsoft/MLOpsTemplate](https://github.com/microsoft/MLOpsTemplate) and click `Fork`. - ![](./images/run_mlopsworkshop_azcli009.png) - - > Note: You will have the same repository (`MLOpsTemplate`) under your Github account name. 
- > Leave the tab open and **do not** close it yet. You will come back to your repository. - -## 3. Choose your development environment -In this step you will clone the above forked repository into a development environment. You can choose between either using the Compute Instance (CI) in Azure Machine Learning (this was pre-created as part of the above steps) or your local laptop. We also provide a 3rd option using the Azure Cloud Shell, but this isn't a recommended approach for actual development, but is an alternative to run basic scripts. Based on this choice, follow the related instructions: -- [Option A. Use CI in AML](#Option-A-Use-Compute-Instance-in-AML) -- [Option B. Use Your laptop(PC/MAC)](#Option-B-Use-your-laptop-(PC/MAC)) -- [Option C. Use the Azure Cloud Shell](#Option-C-Use-the-Azure-Cloud-Shell) - -### Option A. Use Compute Instance in AML - -- A1. Go to [Azure Machine Learning Studio](https://ml.azure.com) - -- A2. Go to __Compute__ > __Compute Instance__ - -- A3. Click new __Terminal link__ - -- A4. Clone __your__ 'MLOpsTemplate' repo in the Terminal of Compute Instance - - - Make sure you have forked the repo to your repository - - Before you run following command, update the _{YOURGITHUBACCOUNT}_ part with your GitHub handle (look at your browser URL for the repo you forked earlier to get this information) - - Run the following command to clone the repo: - ```bash - git clone https://github.com/{YOURGITHUBACCOUNT}/MLOpsTemplate.git - ``` - -> Note: Make sure you are running the command from a similar path structure like below: -> `~/cloudfiles/code/Users/YOURALIAS$` - -![](./images/run_mlopsworkshop_azcli004.png) - -> IMPORTANT: setup Git credentials helper (to avoid typing your username / password every time you push some changes) - -```bash -git config --global credential.helper store -``` - -> IMPORTANT: Git push commands setup (replace with the email linked to your github account + specify your full name) - -```bash -git config --global user.email "my_email@my_company.com" -git config --global user.name "Firstname Lastname" -git config --global push.default simple -``` - -- A5. Generate and register data for the workshop - - - Update arguments "_NAMES_ and _ID_" accordingly and then run following commands from the Terminal - - ```bash - cd ./MLOpsTemplate/src/workshop - conda env create -f conda-local.yml - conda activate mlops-workshop-local - python ./data/create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name "AML_WS_NAME" --sub_id "SUBSCRIPTION_ID" --resourcegroup_name "RG_NAME" - ``` - -> Note: You can find the __Resource Group Name, Azure Machine Learning Name__ and the __Location__ from Azure portal. -> ![](./images/run_mlopsworkshop_azcli010.png) - -- A6. Install az ml CLI v2 - - Run the following command to see the `az extension` - ```bash - az extension list - ``` - - If you see the `azure-cli-ml` extension, remove it by running the following command: - ```bash - az extension remove -n azure-cli-ml - ``` - - If you see `ml` extension, remove it by running the following command: - ```bash - az extension remove -n ml - ``` - - Install the latest az `ml` CLI v2 extension by running the following command: - ```bash - az extension add -n ml -y --version 2.2.1 - ``` - -- A7. 
Setup az cli - - Run the following command from the Terminal: - ```bash - az login - ``` - If you have access to more than 1 tenants, it's advisable to use the syntax below with a designated tenant id to logon to the right tenant - ```bash - az login --tenant "" - ``` - Use the code and follow the instruction to finish the login. - > Note: You need to login in and be authenticated to use the `az cli` extension. - ![](./images/run_mlopsworkshop_azcli006.png) - After copy the __code__ and go to the link, [https://microsoft.com/devicelogin](https://microsoft.com/devicelogin). - - - -- A8. Configure the subscription and Azure Machine Learning Workspace - ```bash - az account set -s "" - az configure --defaults group="" workspace="" location="" - az configure -l -o table - ``` - - > Note: You can find the __Resource Group Name__, __Azure Machine Learning Name__ and __the Location__ from the user profile in the AML Studio. - ![](./images/run_mlopsworkshop_azcli008.png) - - > Note: The results should look like the following: - ![](./images/run_mlopsworkshop_azcli007.png) - -- A9. Create a Service Principal (SP) - - > If you have a Service Principal, please use the existing one. Ignore this step and go to next step 4. - > If you don't have the Service Principal, please follow this step. - - > Note: In case you don't have permission to create SP, please reach out to your Azure infra/security team to get help. - - - Get the following information: - - Your Azure SubscriptionID where your Azure Machine Learning service is - - Resource Group Name where your Azure Machine Learning service is - - (Random) Name for the Service Principal you're about to create - - - To create a Service Principal, run the following command: - - ```bash - az ad sp create-for-rbac --name {REPLACE_SPNAME} --role contributor --scopes /subscriptions/{REPLACE_SUBSCRIPTIONID}/resourceGroups/{REPLACE_RESOURCEGROUPNAME} --sdk-auth - ``` - - ![](./images/arm002.png) - - > IMPORTANT: copy/save the entire json output of that command, we will need this to setup the GitHub Secret that will enable to login into Azure using this Service Principal - - > Note: Once done, leave the terminal open. Do not terminate it and head to [the next step](#4-Configure-secret-in-your-Github-account). - -### Option B. Use your laptop (PC/MAC) -> Note: If you followed Option A, you don't need Option B. - -- B1. Create a local python development environment - - - [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html), [git](https://git-scm.com/downloads?msclkid=1f5aa675b42811ecb1979c5fb8e69812) and your prefered IDE, for example, [VS Code](https://code.visualstudio.com/Download?msclkid=32cd8937b42811ec9681883c942b2912) - - - Use VSCode and VSCode for python if possible - -- B2. Open your local terminal - -- B3. [Install az CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli#install) - -- B4. Install az ml CLI v2 - - Run the following commands from your local terminal - - Check az extension by running following command - - ```bash - az extension list - ``` - - - If you see azure-cli-ml extension, remove it by running following commnad. If you dont see, then move to next step - - ```bash - az extension remove -n azure-cli-ml - ``` - - - If you don't see ml 2.#.# form the extension list, install az ml CLI v2 - - ```bash - az extension add -n ml -y --version 2.2.1 - ``` - -- B5. 
Setup az cli - - - Run the following command from the Termianl - - ```bash - az login - ``` - If you have access to more than 1 tenants, it's advisable to use the syntax below with a designated tenant id to logon to the right tenant - ```bash - az login --tenant "" - ``` - - You need to follow the guide to use `az cli` for the lab - - ![](./images/run_mlopsworkshop_azcli006.png) - - After copying the __code__ from the terminal, open a new tab, go to the link, [https://microsoft.com/devicelogin](https://microsoft.com/devicelogin). - - Use the code and follow the instructions to finish the login. - -- B6. After logging into the `az cli`, come back to your terminal and configure the subscription and Azure Machine Learning Workspace by running the following commands: - - ```bash - az account set -s "" - az configure --defaults group="" workspace="" location="" - az configure -l -o table - ``` - -> Note: You can find the __Resource Group Name, Azure Machine Learning Name__ and __the Location__ from the user profile in the AML Studio. -![](./images/run_mlopsworkshop_azcli008.png) - -- The results will look like the following: - ![](./images/run_mlopsworkshop_azcli007.png) - -- B7. Clone your 'MLOpsTemplate' repo - - Before you run following command, upate __{YOURGITHUBACCOUNT}__ part - - Sample command looks like following - - ```bash - git clone https://github.com/{YOURGITHUBACCOUNT}/MLOpsTemplate.git - ``` - -> IMPORTANT: if this is the first time using Git on your laptop, setup Git credentials helper (to avoid typing your username / password every time you push some changes) - -```bash -git config --global credential.helper store -``` - -> IMPORTANT: if this is your first time using Git on your laptop, go through this Git push commands setup (replace with the email linked to your github account + specify your full name) - -```bash -git config --global user.email "my_email@my_company.com" -git config --global user.name "Firstname Lastname" -git config --global push.default simple -``` - - - Using conda, create a new virtual environment or use an existing virtual environment with azureml-sdk, pandas, sckit-learn - - - If you don't have an existing conda virtual environment, use following command to create new - - ```bash - cd ./MLOpsTemplate/src/workshop - conda env create -f conda-local.yml - ``` - -- B8. Generate and register data for the workshop - - Update arguments __"NAMES and ID"__ accordingly and then run following commands from your local terminal - > You should run the commands from the path, __'MLOpsTemplate/src/workshop$'__ - - ```bash - conda activate mlops-workshop-local - python ./data/create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name "AML_WS_NAME" --sub_id "SUBSCRIPTION_ID" --resourcegroup_name "RG_NAME" - ``` - -- B9. Create Service Principal - - > If you have a Service Principal, please use the existing one. Ignore this step and go to next step 4. - > If you don't have the Service Principal, please follow this step. 
- - - Get following information - - - Your Azure SubscriptionID - - Resource Group Name - - - Update Run following command from the terminal - - ```bash - az ad sp create-for-rbac --name {REPLACE_SPNAME} --role contributor --scopes /subscriptions/{REPLACE_SUBSCRIPTIONID}/resourceGroups/{REPLACE_RESOURCEGROUPNAME} --sdk-auth - ``` - - ![](./images/arm002.png) - - > IMPORTANT: copy/save the entire json output of that command, we will need this to setup the GitHub Secret that will enable to login into Azure using this Service Principal - - > Note: Once done, leave the terminal open. Do not terminate it yet - -### Option C. Use the Azure Cloud Shell - -> Note: If you followed Option A or B, you do not need to to through Option C - -- C1. Launch the Azure Cloud Shell - - - Go to the Azure Portal, click on the Azure Cloud Shell icon on the right side of the top search bar: - - ![](./images/cloudshell2.png) - - Accept the prompt to create a default storage account to host some of the files the Azure Cloud Shell requires to function. The Cloud Shell gives you access to a terminal (PowerShell or Bash) to execute commands within Azure. - - Select Bash. You'll end up with a screen like this once it's started up: - - ![](./images/cloudshell-firstlaunch.png) - -- C2. Check Python Version - - We will need be using a version of Python 3.8. Check to see which version you are using by running the following command: - ``` bash - python -V - ``` - If the output is Python 3.8.X (any version), continue to step C4. If you need to change your Python version, follow the next steps. - -- C3. Download Python 3.8 - 1. ```bash - wget https://repo.continuum.io/miniconda/Miniconda3-py38_4.9.2-Linux-x86_64.sh - ``` - 2. ``` bash - sh Miniconda3-py38_4.9.2-Linux-x86_64.sh - ``` - 3. Accept the agreement and install on the default path - ![](./images/cloudshell-accept.png) - 4. ``` bash - export PATH=~/miniconda3/bin:$PATH - ``` - 5. Verify Python Version - ``` bash - python -V - ``` - -- C4. Install The Azure CLI Machine Learning extension v2 (aka az ml) - - ``` bash - az extension add -n ml -y --version 2.2.1 - ``` - -- C5. Login the CLI to Azure - - - Run the following command from the Terminal - - ```bash - az login - ``` - If you have access to more than 1 tenants, it's advisable to use the syntax below with a designated tenant id to logon to the right tenant - ```bash - az login --tenant "" - ``` - - You need to follow the guide to use `az cli` for the lab - - ![](./images/run_mlopsworkshop_azcli006.png) - - After copying the __code__ from the terminal, open a new tab, go to the link, [https://microsoft.com/devicelogin](https://microsoft.com/devicelogin). - - Use the code and follow the instructions to finish the login. - -- C6. After logging into the `az cli`, come back to your terminal and configure the subscription and Azure Machine Learning Workspace by running the following commands: - - ```bash - az account set -s "" - az configure --defaults group="" workspace="" location="" - az configure -l -o table - ``` - -> Note: You can find the __Resource Group Name, Azure Machine Learning Name__ and __the Location__ from the user profile in the AML Studio. -![](./images/run_mlopsworkshop_azcli008.png) - -- The results will look like the following: - ![](./images/run_mlopsworkshop_azcli007.png) - -- C7. 
Clone your 'MLOpsTemplate' repo and setup your environment - - Before you run following command, upate __{YOURGITHUBACCOUNT}__ part - - Sample command looks like following - - ```bash - git clone https://github.com/{YOURGITHUBACCOUNT}/MLOpsTemplate.git - cd ./MLOpsTemplate/src/workshop - ``` - -> IMPORTANT: setup Git credentials helper (to avoid typing your username / password every time you push some changes) - -```bash -git config --global credential.helper store -``` - -> IMPORTANT: Git push commands setup (replace with the email linked to your github account + specify your full name) - -```bash -git config --global user.email "my_email@my_company.com" -git config --global user.name "Firstname Lastname" -git config --global push.default simple -``` - - - Upgrade pip to this specific version: - - ``` bash - python -m pip install pip==21.3.1 - ``` - - - Using pip, we will install some required packages to run this workshop - - ```bash - pip install -r requirements-local.txt - ``` - -- C8. Generate and register data for the workshop - - Update arguments __"NAMES and ID"__ accordingly and then run following commands from your local terminal - > You should run the commands from the path, __'MLOpsTemplate/src/workshop$'__ - - ```bash - python ./data/create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name "AML_WS_NAME" --sub_id "SUBSCRIPTION_ID" --resourcegroup_name "RG_NAME" - ``` - -- C9. Create Service Principal - - > If you have a Service Principal, please use the existing one. Ignore this step and go to next step 4. - > If you don't have the Service Principal, please follow this step. - - - Get following information - - - Your Azure SubscriptionID - - Resource Group Name - - - Update Run following command from the terminal - - ```bash - az ad sp create-for-rbac --name {REPLACE_SPNAME} --role contributor --scopes /subscriptions/{REPLACE_SUBSCRIPTIONID}/resourceGroups/{REPLACE_RESOURCEGROUPNAME} --sdk-auth - ``` - - ![](./images/arm002.png) - - > IMPORTANT: copy/save the entire json output of that command, we will need this to setup the GitHub Secret that will enable to login into Azure using this Service Principal - - > Note: Once done, leave the terminal open. Do not terminate it yet - - -## 4. Configure secret in your Github account - -The last two tasks include: - - Creating a Personal Access Token (PAT) in Github - - Adding a Service Principal (SP) to your forked repository in Github - - -### 4.1 Create PAT (Personal Access Token) - -You are going to create PAT to allow your code access your personal git repo - -- To make PAT, you need to go to Settings of your account, NOT repo setting - - ![](./images/github4003.png) - -- From the setting, find and __click__ '_<> Developer settings_' menu at the bottom left conner of your screen - - ![](./images/github4004.png) - -- __Click__ '_Personal access token_' and __click__ '_Generate new token_' - - ![](./images/github4005.png) - -- Check for '_repo_' and '_workflow_' for the scope and then __click__ '_create_' at the bottom of your screen - - ![](./images/github4006.png) - -- You'll see the token. Make sure you copy and keep it safe. 
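Before storing the token as a repository secret, you can optionally confirm that it authenticates and carries the `repo` and `workflow` scopes you just selected. A minimal check against the GitHub REST API is sketched below; it assumes the `requests` package is installed, and the token should only be pasted locally, never committed.

```python
# Optional check that the freshly created PAT authenticates and has the expected scopes.
import requests

token = "<PASTE_YOUR_PAT_HERE>"  # keep this local; never commit it
resp = requests.get("https://api.github.com/user",
                    headers={"Authorization": f"token {token}"})
resp.raise_for_status()
print("Authenticated as:", resp.json()["login"])
print("Granted scopes:  ", resp.headers.get("X-OAuth-Scopes"))
```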
- - ![](./images/github4007.png) - -- Now you're going to add the token to your repo - -- Go back to your 'MLOpsTemplate' repo where your forked from microsoft/MLOpsTemplate - - - The url of your repo will looks like this - - ```text - https://github.com/{YOURACCOUNT}}/MLOpsTemplate - ``` - -- From your repo __click__ '_Setting_' - - ![](./images/github4000.png) - -- Find a menu '_Secrets_' on the left side of menu, and __click__ 'Actions'. After that __Click__ 'New repository secret' - - ![](./images/github4001.png) - -- Type `PERSONAL_ACCESS_TOKEN_GITHUB` for the name of the secret, and paste the token you copied from PAT section - - > Important: The name for this secret must be `PERSONAL_ACCESS_TOKEN_GITHUB` - - ![](./images/github4008.png) - - - - -### 4.2 Add SP to your repo in Github - -From this section, you'll add the SP information to your repo. The SP information will be used during the Github Actions. - -You have saved in step A9, B9 or C7 the output of the SP creation command, it should look like this: - -```json -{ - "clientId": "YOUR_APP_ID", - "clientSecret": "YOUR_CLIENT_SECRET", - "subscriptionId": "SUB_ID", - "tenantId": "TENANT_ID", - "activeDirectoryEndpointUrl": "https://login.microsoftonline.com", - "resourceManagerEndpointUrl": "https://management.azure.com/", - "activeDirectoryGraphResourceId": "https://graph.windows.net/", - "sqlManagementEndpointUrl": "https://management.core.windows.net:8443/", - "galleryEndpointUrl": "https://gallery.azure.com/", - "managementEndpointUrl": "https://management.core.windows.net/" -} -``` - -- Go back to your 'MLOpsTemplate' repo where your forked from microsoft/MLOpsTemplate - - - The url of your repo will looks like this - - ```text - https://github.com/{YOURACCOUNT}}/MLOpsTemplate - ``` - -- From your repo __click__ '_Setting_' - - ![](./images/github4000.png) - -- Find a menu '_Secrets_' on the left side of menu, and __click__ 'Actions'. After that __Click__ 'New repository secret' - - ![](./images/github4001.png) - -- Type `AZURE_SERVICE_PRINCIPAL` for the name of the secret, and paste your SP json definition: - - > Important: The name for this secret must be `AZURE_SERVICE_PRINCIPAL` - - ![](./images/github4002.png) - - -## [Go to Part 1](part_1.md) +# Part 0: Workshop Environment Setup +> Note: Read the Workshop scenario overview [here](https://github.com/microsoft/MLOpsTemplate/blob/main/src/workshop/README.md#workshop-scenario) + +## Goal +- Setup Azure ML workspace and components +- Setup github account, a personal access token and configure settings +- Setup local python development environment +- Generate and register data for the workshop +- Setup SP (Service Principal) + +## Pre-requisites for part 0 +- An Azure Account and Subscription +- An understanding of: + - Azure Subscriptions and Resource Groups + - Service Principals + - Github mechanics (creating an account, forking a repo, etc.) + +## Steps + +0. [Check list](./part_tips.md) + +1. [Create Azure Machine Learning resources in Azure](#1-Create-resources-in-Azure) + +2. [Setup Github account and settings](#2-Setup-github-account-and-settings) + +3. [Setup your development environment](#3-Choose-your-development-environment) + - Option A: Use CI as your local in AML + - Option B: Use your local machine (PC or MAC) + +4. 
[Configure secret in your Github account](#4-Configure-secret-in-your-Github-account) + - Create a Personal Accesss Token (PAT) + - Add a Service Principal (SP) to your repo in Github + +> Note: For a detailed video walk-through of the process, click on the video below to walk through Steps 0 to 3. +> [![VideoGuide](./images/video_img.png)](https://youtu.be/k9ExpebwR18) + + +## 1. Create resources in Azure + +> IMPORTANT: You can skip this section if you've been provided a subscription already setup by Microsoft. + +>Note: To create resources you need an `Owner` or `Contributor` role for your subscription or resource group. If you don't have either one of these roles, you cannot create any of the following resources. + +- 1.1 Open the following link in a new tab. + + [![Deploy to Azure](./images/deploy-to-azure.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2FMLOpsTemplate%2Fmain%2Fsrc%2Fworkshop%2Fdocuments%2FIaC%2Fiac_EZ_MLOps.json) + +- 1.2 Fill out the rest and click the `Create` button at the bottom. + + ![](./images/arm000.png) + + > Note: The provisioning will take 4 mins to 5 mins. If you want to see the progress of the provisioning, you can click the 'Notification' icon. When the provisioning is done, leave the tab open (don't close it!). You can leave it and open a new tab in your browser for the next step. + ![](./images/arm001.png) + + > IMPORTANT: If this deployment fails, you cannot do the following steps in the workshop. Please inform your CSA or lab instructor with the error message. + +## 2. Setup Github Account and Settings + +- 2.1 From the new browser tab, go to [Github](https://github.com/) and login to your account. + > Note: If you don't have an account for Github, please sign up. The workshop can't be done without a Github account. + +- 2.2 After the login, go to [https://github.com/microsoft/MLOpsTemplate](https://github.com/microsoft/MLOpsTemplate) and click `Fork`. + ![](./images/run_mlopsworkshop_azcli009.png) + + > Note: You will have the same repository (`MLOpsTemplate`) under your Github account name. + > Leave the tab open and **do not** close it yet. You will come back to your repository. + +## 3. Choose your development environment +In this step you will clone the above forked repository into a development environment. You can choose between either using the Compute Instance (CI) in Azure Machine Learning (this was pre-created as part of the above steps) or your local laptop. We also provide a 3rd option using the Azure Cloud Shell, but this isn't a recommended approach for actual development, but is an alternative to run basic scripts. Based on this choice, follow the related instructions: +- [Option A. Use CI in AML](#Option-A-Use-Compute-Instance-in-AML) +- [Option B. Use Your laptop(PC/MAC)](#Option-B-Use-your-laptop-(PC/MAC)) +- [Option C. Use the Azure Cloud Shell](#Option-C-Use-the-Azure-Cloud-Shell) + +### Option A. Use Compute Instance in AML + +- A1. Go to [Azure Machine Learning Studio](https://ml.azure.com) + +- A2. Go to __Compute__ > __Compute Instance__ + +- A3. Click new __Terminal link__ + +- A4. 
Clone __your__ 'MLOpsTemplate' repo in the Terminal of Compute Instance + + - Make sure you have forked the repo to your repository + - Before you run following command, update the _{YOURGITHUBACCOUNT}_ part with your GitHub handle (look at your browser URL for the repo you forked earlier to get this information) + - Run the following command to clone the repo: + ```bash + git clone https://github.com/{YOURGITHUBACCOUNT}/MLOpsTemplate.git + ``` + +> Note: Make sure you are running the command from a similar path structure like below: +> `~/cloudfiles/code/Users/YOURALIAS$` + +![](./images/run_mlopsworkshop_azcli004.png) + +> IMPORTANT: setup Git credentials helper (to avoid typing your username / password every time you push some changes) + +```bash +git config --global credential.helper store +``` + +> IMPORTANT: Git push commands setup (replace with the email linked to your github account + specify your full name) + +```bash +git config --global user.email "my_email@my_company.com" +git config --global user.name "Firstname Lastname" +git config --global push.default simple +``` + +- A5. Generate and register data for the workshop + + - Update arguments "_NAMES_ and _ID_" accordingly and then run following commands from the Terminal + + ```bash + cd ./MLOpsTemplate/src/workshop + conda env create -f conda-local.yml + conda activate mlops-workshop-local + python ./data/create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name "AML_WS_NAME" --sub_id "SUBSCRIPTION_ID" --resourcegroup_name "RG_NAME" + ``` + +> Note: You can find the __Resource Group Name, Azure Machine Learning Name__ and the __Location__ from Azure portal. +> ![](./images/run_mlopsworkshop_azcli010.png) + +- A6. Install az ml CLI v2 + - Run the following command to see the `az extension` + ```bash + az extension list + ``` + - If you see the `azure-cli-ml` extension, remove it by running the following command: + ```bash + az extension remove -n azure-cli-ml + ``` + - If you see `ml` extension, remove it by running the following command: + ```bash + az extension remove -n ml + ``` + - Install the latest az `ml` CLI v2 extension by running the following command: + ```bash + az extension add -n ml -y --version 2.2.1 + ``` + +- A7. Setup az cli + - Run the following command from the Terminal: + ```bash + az login + ``` + If you have access to more than 1 tenants, it's advisable to use the syntax below with a designated tenant id to logon to the right tenant + ```bash + az login --tenant "" + ``` + Use the code and follow the instruction to finish the login. + > Note: You need to login in and be authenticated to use the `az cli` extension. + ![](./images/run_mlopsworkshop_azcli006.png) + After copy the __code__ and go to the link, [https://microsoft.com/devicelogin](https://microsoft.com/devicelogin). + + + +- A8. Configure the subscription and Azure Machine Learning Workspace + ```bash + az account set -s "" + az configure --defaults group="" workspace="" location="" + az configure -l -o table + ``` + + > Note: You can find the __Resource Group Name__, __Azure Machine Learning Name__ and __the Location__ from the user profile in the AML Studio. + ![](./images/run_mlopsworkshop_azcli008.png) + + > Note: The results should look like the following: + ![](./images/run_mlopsworkshop_azcli007.png) + +- A9. Create a Service Principal (SP) + + > If you have a Service Principal, please use the existing one. Ignore this step and go to next step 4. + > If you don't have the Service Principal, please follow this step. 
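Before creating the service principal, it can be worth confirming that the defaults configured in A8 point at a reachable workspace and that the computes provisioned in step 1 exist. A hedged sketch using the `azureml-core` SDK from the `mlops-workshop-local` environment created in A5; the names are placeholders:

```python
# Optional sanity check that the AML workspace and its computes are reachable
# from the conda environment created in A5. All names are placeholders.
from azureml.core import Workspace

ws = Workspace.get(name="<AML_WS_NAME>",
                   subscription_id="<SUBSCRIPTION_ID>",
                   resource_group="<RG_NAME>")
print("Workspace:", ws.name, "|", ws.location)

for name, compute in ws.compute_targets.items():
    print(f"compute target: {name} ({compute.type}) -> {compute.provisioning_state}")
```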
+ + > Note: In case you don't have permission to create SP, please reach out to your Azure infra/security team to get help. + + - Get the following information: + - Your Azure SubscriptionID where your Azure Machine Learning service is + - Resource Group Name where your Azure Machine Learning service is + - (Random) Name for the Service Principal you're about to create + + - To create a Service Principal, run the following command: + + ```bash + az ad sp create-for-rbac --name {REPLACE_SPNAME} --role contributor --scopes /subscriptions/{REPLACE_SUBSCRIPTIONID}/resourceGroups/{REPLACE_RESOURCEGROUPNAME} --sdk-auth + ``` + + ![](./images/arm002.png) + + > IMPORTANT: copy/save the entire json output of that command, we will need this to setup the GitHub Secret that will enable to login into Azure using this Service Principal + + > Note: Once done, leave the terminal open. Do not terminate it and head to [the next step](#4-Configure-secret-in-your-Github-account). + +### Option B. Use your laptop (PC/MAC) +> Note: If you followed Option A, you don't need Option B. + +- B1. Create a local python development environment + + - [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html), [git](https://git-scm.com/downloads?msclkid=1f5aa675b42811ecb1979c5fb8e69812) and your prefered IDE, for example, [VS Code](https://code.visualstudio.com/Download?msclkid=32cd8937b42811ec9681883c942b2912) + + - Use VSCode and VSCode for python if possible + +- B2. Open your local terminal + +- B3. [Install az CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli#install) + +- B4. Install az ml CLI v2 + - Run the following commands from your local terminal + - Check az extension by running following command + + ```bash + az extension list + ``` + + - If you see azure-cli-ml extension, remove it by running following commnad. If you dont see, then move to next step + + ```bash + az extension remove -n azure-cli-ml + ``` + + - If you don't see ml 2.#.# form the extension list, install az ml CLI v2 + + ```bash + az extension add -n ml -y --version 2.2.1 + ``` + +- B5. Setup az cli + + - Run the following command from the Termianl + + ```bash + az login + ``` + If you have access to more than 1 tenants, it's advisable to use the syntax below with a designated tenant id to logon to the right tenant + ```bash + az login --tenant "" + ``` + - You need to follow the guide to use `az cli` for the lab + + ![](./images/run_mlopsworkshop_azcli006.png) + + After copying the __code__ from the terminal, open a new tab, go to the link, [https://microsoft.com/devicelogin](https://microsoft.com/devicelogin). + + Use the code and follow the instructions to finish the login. + +- B6. After logging into the `az cli`, come back to your terminal and configure the subscription and Azure Machine Learning Workspace by running the following commands: + + ```bash + az account set -s "" + az configure --defaults group="" workspace="" location="" + az configure -l -o table + ``` + +> Note: You can find the __Resource Group Name, Azure Machine Learning Name__ and __the Location__ from the user profile in the AML Studio. +![](./images/run_mlopsworkshop_azcli008.png) + +- The results will look like the following: + ![](./images/run_mlopsworkshop_azcli007.png) + +- B7. 
+- B7. Clone your 'MLOpsTemplate' repo
+    - Before you run the following command, update the __{YOURGITHUBACCOUNT}__ part with your GitHub handle
+    - The command looks like the following:
+
+    ```bash
+    git clone https://github.com/{YOURGITHUBACCOUNT}/MLOpsTemplate.git
+    ```
+
+> IMPORTANT: if this is the first time you are using Git on your laptop, set up the Git credential helper (to avoid typing your username / password every time you push changes)
+
+```bash
+git config --global credential.helper store
+```
+
+> IMPORTANT: if this is your first time using Git on your laptop, go through this Git configuration (replace the values with the email linked to your GitHub account and your full name)
+
+```bash
+git config --global user.email "my_email@my_company.com"
+git config --global user.name "Firstname Lastname"
+git config --global push.default simple
+```
+
+    - Using conda, create a new virtual environment or use an existing virtual environment with azureml-sdk, pandas and scikit-learn
+
+    - If you don't have an existing conda virtual environment, use the following commands to create a new one:
+
+    ```bash
+    cd ./MLOpsTemplate/src/workshop
+    conda env create -f conda-local.yml
+    ```
+
+- B8. Generate and register data for the workshop
+    - Update the name and ID arguments (__"AML_WS_NAME", "SUBSCRIPTION_ID", "RG_NAME"__) accordingly and then run the following commands from your local terminal
+    > You should run the commands from the __'MLOpsTemplate/src/workshop'__ folder
+
+    ```bash
+    conda activate mlops-workshop-local
+    python ./data/create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name "AML_WS_NAME" --sub_id "SUBSCRIPTION_ID" --resourcegroup_name "RG_NAME"
+    ```
+
+- B9. Create a Service Principal
+
+    > If you already have a Service Principal, please use the existing one: ignore this step and go to step 4.
+    > If you don't have a Service Principal, please follow this step.
+
+    - Get the following information:
+
+        - Your Azure Subscription ID
+        - The Resource Group Name
+
+    - Update and run the following command from the terminal:
+
+    ```bash
+    az ad sp create-for-rbac --name {REPLACE_SPNAME} --role contributor --scopes /subscriptions/{REPLACE_SUBSCRIPTIONID}/resourceGroups/{REPLACE_RESOURCEGROUPNAME} --sdk-auth
+    ```
+
+    ![](./images/arm002.png)
+
+    > IMPORTANT: copy/save the entire JSON output of that command; we will need it to set up the GitHub secret that allows GitHub Actions to log in to Azure using this Service Principal.
+
+    > Note: Once done, leave the terminal open. Do not terminate it yet.
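+
+> Note: This is an optional variation, not a required workshop step. If you prefer, you can write the Service Principal JSON straight to a file so it is easier to paste into the GitHub secret later, and then confirm the role assignment. The file name `sp_output.json` and the `{REPLACE_CLIENTID}` placeholder (the `clientId` value from the JSON output) are just examples:
+
+```bash
+# Capture the JSON output in a file instead of copying it from the terminal
+az ad sp create-for-rbac --name {REPLACE_SPNAME} --role contributor \
+  --scopes /subscriptions/{REPLACE_SUBSCRIPTIONID}/resourceGroups/{REPLACE_RESOURCEGROUPNAME} \
+  --sdk-auth > sp_output.json
+
+# Confirm the Service Principal got the Contributor role on the resource group
+az role assignment list --assignee {REPLACE_CLIENTID} --resource-group {REPLACE_RESOURCEGROUPNAME} -o table
+```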
+### Option C. Use the Azure Cloud Shell
+
+> Note: If you followed Option A or B, you do not need to go through Option C.
+
+- C1. Launch the Azure Cloud Shell
+
+    - Go to the Azure Portal, click on the Azure Cloud Shell icon on the right side of the top search bar:
+
+    ![](./images/cloudshell2.png)
+
+    Accept the prompt to create a default storage account to host some of the files the Azure Cloud Shell requires to function. The Cloud Shell gives you access to a terminal (PowerShell or Bash) to execute commands within Azure.
+
+    Select Bash. You'll end up with a screen like this once it's started up:
+
+    ![](./images/cloudshell-firstlaunch.png)
+
+- C2. Check the Python version
+    - You will need Python 3.8. Check which version you are using by running the following command:
+    ``` bash
+    python -V
+    ```
+    If the output is Python 3.8.X (any minor version), continue to step C4. If you need to change your Python version, follow the next steps.
+
+- C3. Download Python 3.8
+    1. Download the Miniconda installer:
+       ```bash
+       wget https://repo.continuum.io/miniconda/Miniconda3-py38_4.9.2-Linux-x86_64.sh
+       ```
+    2. Run the installer:
+       ``` bash
+       sh Miniconda3-py38_4.9.2-Linux-x86_64.sh
+       ```
+    3. Accept the agreement and install on the default path
+       ![](./images/cloudshell-accept.png)
+    4. Add Miniconda to the PATH of the current session:
+       ``` bash
+       export PATH=~/miniconda3/bin:$PATH
+       ```
+    5. Verify the Python version:
+       ``` bash
+       python -V
+       ```
+
+- C4. Install the Azure CLI Machine Learning extension v2 (aka az ml)
+
+    ``` bash
+    az extension add -n ml -y --version 2.2.1
+    ```
+
+- C5. Log the CLI in to Azure
+
+    - Run the following command from the Terminal:
+
+    ```bash
+    az login
+    ```
+    If you have access to more than one tenant, use the syntax below with the tenant id to make sure you log on to the right tenant:
+    ```bash
+    az login --tenant ""
+    ```
+    - You need to complete this login to use the `az` CLI for the lab
+
+    ![](./images/run_mlopsworkshop_azcli006.png)
+
+    After copying the __code__ from the terminal, open a new tab and go to [https://microsoft.com/devicelogin](https://microsoft.com/devicelogin).
+
+    Use the code and follow the instructions to finish the login.
+
+- C6. After logging into the `az` CLI, come back to your terminal and configure the subscription and Azure Machine Learning Workspace by running the following commands:
+
+    ```bash
+    az account set -s ""
+    az configure --defaults group="" workspace="" location=""
+    az configure -l -o table
+    ```
+
+> Note: You can find the __Resource Group Name, Azure Machine Learning Name__ and __the Location__ from the user profile in the AML Studio.
+![](./images/run_mlopsworkshop_azcli008.png)
+
+- The results will look like the following:
+    ![](./images/run_mlopsworkshop_azcli007.png)
+
+- C7. Clone your 'MLOpsTemplate' repo and set up your environment
+    - Before you run the following commands, update the __{YOURGITHUBACCOUNT}__ part with your GitHub handle
+    - The commands look like the following:
+
+    ```bash
+    git clone https://github.com/{YOURGITHUBACCOUNT}/MLOpsTemplate.git
+    cd ./MLOpsTemplate/src/workshop
+    ```
+
+> IMPORTANT: set up the Git credential helper (to avoid typing your username / password every time you push changes)
+
+```bash
+git config --global credential.helper store
+```
+
+> IMPORTANT: configure Git for pushes (replace the values with the email linked to your GitHub account and your full name)
+
+```bash
+git config --global user.email "my_email@my_company.com"
+git config --global user.name "Firstname Lastname"
+git config --global push.default simple
+```
+
+    - Upgrade pip to this specific version:
+
+    ``` bash
+    python -m pip install pip==21.3.1
+    ```
+
+    - Using pip, install the packages required to run this workshop:
+
+    ```bash
+    pip install -r requirements-local.txt
+    ```
+
+- C8. Generate and register data for the workshop
+    - Update the name and ID arguments (__"AML_WS_NAME", "SUBSCRIPTION_ID", "RG_NAME"__) accordingly and then run the following command from the terminal
+    > You should run the command from the __'MLOpsTemplate/src/workshop'__ folder
+
+    ```bash
+    python ./data/create_datasets.py --datastore_name workspaceblobstore --ml_workspace_name "AML_WS_NAME" --sub_id "SUBSCRIPTION_ID" --resourcegroup_name "RG_NAME"
+    ```
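+
+> Note: This is an optional check, not a required workshop step. Assuming the data generation script writes its output under the local `data` folder (these are the files Part 1 expects there, e.g. `green_taxi.parquet`, `weather.parquet` and `holidays.parquet`), you can quickly confirm it succeeded:
+
+```bash
+# List the generated parquet files from the workshop folder
+ls -lh ./data/*.parquet
+```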
+- C9. Create a Service Principal
+
+    > If you already have a Service Principal, please use the existing one: ignore this step and go to step 4.
+    > If you don't have a Service Principal, please follow this step.
+
+    - Get the following information:
+
+        - Your Azure Subscription ID
+        - The Resource Group Name
+
+    - Update and run the following command from the terminal:
+
+    ```bash
+    az ad sp create-for-rbac --name {REPLACE_SPNAME} --role contributor --scopes /subscriptions/{REPLACE_SUBSCRIPTIONID}/resourceGroups/{REPLACE_RESOURCEGROUPNAME} --sdk-auth
+    ```
+
+    ![](./images/arm002.png)
+
+    > IMPORTANT: copy/save the entire JSON output of that command; we will need it to set up the GitHub secret that allows GitHub Actions to log in to Azure using this Service Principal.
+
+    > Note: Once done, leave the terminal open. Do not terminate it yet.
+
+
+## 4. Configure secret in your Github account
+
+The last two tasks are:
+ - Creating a Personal Access Token (PAT) in GitHub
+ - Adding the Service Principal (SP) to your forked repository in GitHub
+
+
+### 4.1 Create a PAT (Personal Access Token)
+
+You are going to create a PAT to allow your code to access your personal GitHub repo.
+
+- To create a PAT, you need to go to the Settings of your account, NOT the repo settings
+
+    ![](./images/github4003.png)
+
+- From the settings, find and __click__ the '_<> Developer settings_' menu at the bottom left corner of your screen
+
+    ![](./images/github4004.png)
+
+- __Click__ '_Personal access tokens_' and then __click__ '_Generate new token_'
+
+    ![](./images/github4005.png)
+
+- Check the '_repo_' and '_workflow_' scopes and then __click__ '_Generate token_' at the bottom of your screen
+
+    ![](./images/github4006.png)
+
+- You'll see the token. Make sure you copy it and keep it safe.
+
+    ![](./images/github4007.png)
+
+- Now you're going to add the token to your repo
+
+- Go back to your 'MLOpsTemplate' repo, the one you forked from microsoft/MLOpsTemplate
+
+    - The URL of your repo will look like this:
+
+    ```text
+    https://github.com/{YOURACCOUNT}/MLOpsTemplate
+    ```
+
+- From your repo __click__ '_Settings_'
+
+    ![](./images/github4000.png)
+
+- Find the '_Secrets_' menu on the left side, and __click__ 'Actions'.
After that __Click__ 'New repository secret' + + ![](./images/github4001.png) + +- Type `AZURE_SERVICE_PRINCIPAL` for the name of the secret, and paste your SP json definition: + + > Important: The name for this secret must be `AZURE_SERVICE_PRINCIPAL` + + ![](./images/github4002.png) + + +## [Go to Part 1](part_1.md) diff --git a/src/workshop/documents/part_1.md b/src/workshop/documents/part_1.md index 925569c1..2e3de5e6 100644 --- a/src/workshop/documents/part_1.md +++ b/src/workshop/documents/part_1.md @@ -1,105 +1,105 @@ - -# Part 1: Structure code for fast iterative development -## Pre-requisites -- Complete [Part 0](part_0.md) to setup the Azure ML workspace. Ensure the following: - - Your conda environment ``mlops-workshop-local`` is activated. - - You completed the step to run [create_datasets.py](part_0.md#option-a-use-compute-instance-for-code-development). - -## Summary -Your team has been working on a new ML problem. The team has been performing exploratory work on data and algorithm and has come to a state that the solution direction is solidified. Now, it is a time to put a structure into the work so that the team can iterate faster toward building a fully functional solution.   - -So far, team members have been working mostly on Jupyter notebooks on their personal compute (Azure CI & PC). As the first step in MLOps, your team needs to accompblish the followings:  - -- Modularization: monolithic notebook is refactored into python modules that can be developed and tested independently and in parallel by multiple members -- Parameterization: The modules are parameterized so that they be rerun with different parameter values. - -To illustrate how the process works, the notebook was refactored into a feature engineering module, an ml training module and an ml evaluating module and you will run these modules individually in local development environment to see how they work. - - ![monolithic to modular](./images/monolithic_modular.png) - -## Steps - -> Note: You can run following tasks on Compute Instance in your Azure Machine Learning. You can use __Jupyter__ or __VSCode__. - -1. Familiarize yourself with the steps in this [jupyter - notebook](../notebooks/taxi-tutorial.ipynb). This showcases the overall data engineering and model building - process. **There is no need to run this as part of this workshop.** - > Note: If you do want to run this notebook, it is recommended to run this in a virtual environment using the conda dependencies specified in this file: `MLOpsTemplate/src/workshop/conda-local.yml`. Additionally, if you run the notebook from a Compute Instance, you can first configure your conda environment with these dependencies, and then leverage the ability to add new kernels referenced [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-terminal#add-new-kernels) to run your notebook. - -2. Discuss in your team why a monolithic code structure is a challenge to a scalable and repeatable ML development process? - > Note: Now observe how the monolithic notebook was refactored into a feature/data engineering module, a ML training module and a model validation module so that they can be developed and run independently. - -3. Go to the workshop folder. - > Action Items: Run the following code snippet. - ```bash - cd src/workshop - ``` - > Note: Review the ```workshop/data``` folder. There are data files that were created by the data generation process. 
The same data files were also sent to the Azure Machine Learning Studio's default datastore under ```workspaceblobstore/mlops_workshop/data```. -4. Create your own development branch where you can make and track changes. This branch will be your development area to create and test new code or pipelines before committing or merging the code into a common branch, such as ```integration```. - - - Run following command to create a new branch named "yourname-dev" - ```bash - git checkout -b yourname-dev - ``` - - This will set the working branch to ```yourname-dev```. To check, run the following command: - ```bash - git branch - ``` -5. Review the refactored engineering logic from the notebook at ```feature_engineering.py``` module under the ```data_engineering``` folder. - - The module performs the following: - - Accepts the following parameters: - - ```input_folder```: path to a folder for input data. The value for local test run is ```data``` - - ```prep_data```: path to a folder for output data. The value for local test run is ```data``` - - ```public_holiday_file_name```: name of the public holiday file. The value for local test run is ```holidays.parquet``` - - ```weather_file_name```: name of the weather raw file.It's ```weather.parquet``` - - ```nyc_file_name```: name of the newyork taxi raw file. It's ```green_taxi.parquet``` - - Performs data transformation, data merging and feature engineering logics - - Splits the data into train and test sets where test_size is 20% - - Writes the output data files to output folder - > Action Item: Run the following code snippet. - ```bash - python core/data_engineering/feature_engineering.py \ - --input_folder data \ - --prep_data data \ - --public_holiday_file_name holidays.parquet \ - --weather_file_name weather.parquet \ - --nyc_file_name green_taxi.parquet -5. Review the refactored ML training logic at ```ml_training.py``` module under training folder. - - The module performs the following: - - Accepts the following parameters: - - ```prep_data```: path to a folder for input data. The value for local test run is ```data``` - - ```input_file_name```: name of the input train data file. The value for local test run is ```final_df.parquet``` - - ```model_folder```: path to a output folder to save trained model.The value for local test run is ```data``` - - Splits input train data into train and validation dataset, perform training - - Prints out MAPE, R2 and RMSE metrics - - Writes the train model file to output folder - > Action Item: Run the following code snippet. - ```bash - python core/training/ml_training.py \ - --prep_data data \ - --input_file_name final_df.parquet \ - --model_folder data -6. Review the refactored ML training logic at ```ml_evaluating.py``` module under evaluating folder. - - The module performs the following: - - Accepts the following parameters: - - ```prep_data```: path to a folder for test input data.The value for local test run is ```data```. - - ```input_file_name```: name of the input test data file. The value for local test run is ```test_df.parquet```. - - ```model_folder```: path to a model folder.The value for local test run is ```data``` - - Loads the model - - Scores the model on input test data, print out MAPE, R2 and RMSE metrics - > Action Item: Run the following code snippet. 
- ```bash - python core/evaluating/ml_evaluating.py \ - --prep_data data \ - --input_file_name test_df.parquet - -## Success criteria -- Feature engineering module: - - Data is processed correctly and output to a folder as final_df.parquet and test_df.parquet files and ready to be ML trained -- ML training module - - Perform ML training and print out MAPE, R2 and RMSE metrics from input datasets - - Produce the model at the output location -- ML evaluating module - - Perform ML training and print out MAPE, R2 and RMSE metrics from an input dataset and output a model file - -## [Go to Part 2](part_2.md) + +# Part 1: Structure code for fast iterative development +## Pre-requisites +- Complete [Part 0](part_0.md) to setup the Azure ML workspace. Ensure the following: + - Your conda environment ``mlops-workshop-local`` is activated. + - You completed the step to run [create_datasets.py](part_0.md#option-a-use-compute-instance-for-code-development). + +## Summary +Your team has been working on a new ML problem. The team has been performing exploratory work on data and algorithm and has come to a state that the solution direction is solidified. Now, it is a time to put a structure into the work so that the team can iterate faster toward building a fully functional solution.   + +So far, team members have been working mostly on Jupyter notebooks on their personal compute (Azure CI & PC). As the first step in MLOps, your team needs to accompblish the followings:  + +- Modularization: monolithic notebook is refactored into python modules that can be developed and tested independently and in parallel by multiple members +- Parameterization: The modules are parameterized so that they be rerun with different parameter values. + +To illustrate how the process works, the notebook was refactored into a feature engineering module, an ml training module and an ml evaluating module and you will run these modules individually in local development environment to see how they work. + + ![monolithic to modular](./images/monolithic_modular.png) + +## Steps + +> Note: You can run following tasks on Compute Instance in your Azure Machine Learning. You can use __Jupyter__ or __VSCode__. + +1. Familiarize yourself with the steps in this [jupyter + notebook](../notebooks/taxi-tutorial.ipynb). This showcases the overall data engineering and model building + process. **There is no need to run this as part of this workshop.** + > Note: If you do want to run this notebook, it is recommended to run this in a virtual environment using the conda dependencies specified in this file: `MLOpsTemplate/src/workshop/conda-local.yml`. Additionally, if you run the notebook from a Compute Instance, you can first configure your conda environment with these dependencies, and then leverage the ability to add new kernels referenced [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-terminal#add-new-kernels) to run your notebook. + +2. Discuss in your team why a monolithic code structure is a challenge to a scalable and repeatable ML development process? + > Note: Now observe how the monolithic notebook was refactored into a feature/data engineering module, a ML training module and a model validation module so that they can be developed and run independently. + +3. Go to the workshop folder. + > Action Items: Run the following code snippet. + ```bash + cd src/workshop + ``` + > Note: Review the ```workshop/data``` folder. There are data files that were created by the data generation process. 
The same data files were also sent to the Azure Machine Learning Studio's default datastore under ```workspaceblobstore/mlops_workshop/data```. +4. Create your own development branch where you can make and track changes. This branch will be your development area to create and test new code or pipelines before committing or merging the code into a common branch, such as ```integration```. + + - Run following command to create a new branch named "yourname-dev" + ```bash + git checkout -b yourname-dev + ``` + - This will set the working branch to ```yourname-dev```. To check, run the following command: + ```bash + git branch + ``` +5. Review the refactored engineering logic from the notebook at ```feature_engineering.py``` module under the ```data_engineering``` folder. + - The module performs the following: + - Accepts the following parameters: + - ```input_folder```: path to a folder for input data. The value for local test run is ```data``` + - ```prep_data```: path to a folder for output data. The value for local test run is ```data``` + - ```public_holiday_file_name```: name of the public holiday file. The value for local test run is ```holidays.parquet``` + - ```weather_file_name```: name of the weather raw file.It's ```weather.parquet``` + - ```nyc_file_name```: name of the newyork taxi raw file. It's ```green_taxi.parquet``` + - Performs data transformation, data merging and feature engineering logics + - Splits the data into train and test sets where test_size is 20% + - Writes the output data files to output folder + > Action Item: Run the following code snippet. + ```bash + python core/data_engineering/feature_engineering.py \ + --input_folder data \ + --prep_data data \ + --public_holiday_file_name holidays.parquet \ + --weather_file_name weather.parquet \ + --nyc_file_name green_taxi.parquet +5. Review the refactored ML training logic at ```ml_training.py``` module under training folder. + - The module performs the following: + - Accepts the following parameters: + - ```prep_data```: path to a folder for input data. The value for local test run is ```data``` + - ```input_file_name```: name of the input train data file. The value for local test run is ```final_df.parquet``` + - ```model_folder```: path to a output folder to save trained model.The value for local test run is ```data``` + - Splits input train data into train and validation dataset, perform training + - Prints out MAPE, R2 and RMSE metrics + - Writes the train model file to output folder + > Action Item: Run the following code snippet. + ```bash + python core/training/ml_training.py \ + --prep_data data \ + --input_file_name final_df.parquet \ + --model_folder data +6. Review the refactored ML training logic at ```ml_evaluating.py``` module under evaluating folder. + - The module performs the following: + - Accepts the following parameters: + - ```prep_data```: path to a folder for test input data.The value for local test run is ```data```. + - ```input_file_name```: name of the input test data file. The value for local test run is ```test_df.parquet```. + - ```model_folder```: path to a model folder.The value for local test run is ```data``` + - Loads the model + - Scores the model on input test data, print out MAPE, R2 and RMSE metrics + > Action Item: Run the following code snippet. 
+ ```bash + python core/evaluating/ml_evaluating.py \ + --prep_data data \ + --input_file_name test_df.parquet + +## Success criteria +- Feature engineering module: + - Data is processed correctly and output to a folder as final_df.parquet and test_df.parquet files and ready to be ML trained +- ML training module + - Perform ML training and print out MAPE, R2 and RMSE metrics from input datasets + - Produce the model at the output location +- ML evaluating module + - Perform ML training and print out MAPE, R2 and RMSE metrics from an input dataset and output a model file + +## [Go to Part 2](part_2.md) diff --git a/src/workshop/documents/part_2.md b/src/workshop/documents/part_2.md index dab9fc6a..9859aefe 100644 --- a/src/workshop/documents/part_2.md +++ b/src/workshop/documents/part_2.md @@ -1,102 +1,102 @@ - -# Part 2: Use cloud scale compute to run, deploy and manage ML experiment with Azure ML - -## Pre-requisites -- Complete [Part 0](part_0.md), [Part 1](part_1.md) -- Run each module feature_engineering, ml_training and evaluating successfully in local mode -- Have Azure ML workspace setup with a Compute Cluster named ```cpu-cluster``` - -## Summary -After successfully restructuring the jupyter notebook and run modules locally, your team wants to leverage Azure cloud to run the experiment at scale. -They also want to take advantage of experiment tracking and model management capabilities in Azure ML to keep track of experiment. -Finally, the team wants to deploy the model as a rest endpoint for real time inferencing and experience the option of deploying it as batch inferencing. -To accomplish these goals, you will perform the following: -- Run feature_engineering module as a job in Azure AML -- Run ml_training module as a job in Azure ML and observe the experiment metrics -- Run evaluating module as a job in Azure ML and observe how the model can be registered to Azure ML model's repo -- Run the three modules together as a pipeline -- Deploy and test the produced ML model as an API using Azure Managed Online Endpoint - - -## Steps -1. Go to the workshop folder. - > Action Item: Run the following code snippet. - ```bash - cd src/workshop - ``` -2. Set defaults values to configure your resource group and workspace. - > Action Item: Run the following code snippet. - ```bash - az configure --defaults group=YOUR_RESOURCE_GROUP workspace=YOUR_WORKSPACE - ``` - -3. Run the ```feature_engineering.py``` module under the ```data_engineering``` folder by following the steps below: - > Action Items: - > - Run the following code snippet: - ```bash - az ml job create -f core/data_engineering/feature_engineering.yml - ``` - > - Go to Azure ML Studio and locate the run detail for this experiment. - -4. Run the ```ml_training.py``` module under the ```training``` folder by following the steps below: - > Action Items: - > - Run the following code snippet: - ```bash - az ml job create -f core/training/ml_training.yml - ``` - > - Go to Azure ML Studio and locate the run detail for this experiment. - -5. Run the ```ml_evaluating.py``` module under the ```evaluating``` folder by following the steps below: - > Action Items: - > - Run the following code snippet: - - ```bash - az ml job create -f core/evaluating/ml_evaluating.yml - ``` - > - Go to Azure ML Studio and locate the run detail for this experiment. Observe the ML metrics and how the model was logged to Azure ML's model registry. - -6. Create a pipeline that runs the feature_engineering, training and evaluation in one workflow. 
- > Action Items: Run the pipeline, by running the following code snippet. - - ```bash - az ml job create -f core/pipelines/training_pipeline.yml - ``` - > - Go to the run detail at Azure ML studio and observe the relationship graph among the modules. (See chart below as well.) - -7. Discuss this question: Why should we run the modules both individually and together in a pipeline? - -8. Deploy to Azure ML Managed Online Endpoint by following the steps below: - > Action Items: - > - Update the ```endpoint.yml``` file and ```deployment.yml``` by updating the name of the endpoint (should be a unique name) - > - Create your endpoint - ```bash - az ml online-endpoint create --file core/scoring/endpoint.yml - ``` - > - Create a green deployment - ```bash - az ml online-deployment create --file core/scoring/deployment.yml - ``` - > - Test the deployed service with mock-up data from scoring_test_request.json - ```bash - az ml online-endpoint invoke -n YOUR_ENDPOINT_NAME --deployment green --request-file core/scoring/scoring_test_request.json - ``` - > - Observe the returned scores from the endpoint evaluation. - -### The entire training pipeline is illustrated with this diagram -![training_pipeline](images/training_pipeline.png) - -## Success criteria -- Run the modules individually in Azure -- Capture metrics and models in ml_training and ml_evaluating modules -- Run three modules together in a pipeline -- Model is deployed successfully to managed endpoint. -- Testing is successful - -## Reference materials -- [Azure ML CLI v2 tutorial](https://docs.microsoft.com/en-us/learn/paths/train-models-azure-machine-learning-cli-v2/) -- [Azure ML CLI single job examples](https://github.com/Azure/azureml-examples/tree/main/cli/jobs/single-step) -- [Azure ML CLI pipeline examples](https://github.com/Azure/azureml-examples/tree/main/cli/jobs/pipelines) -- [Deploy to managed online endpoint](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-managed-online-endpoints) -- [Deploy to batch endpoint](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-batch-endpoint) - -## [Go to Part 3](part_3.md) + +# Part 2: Use cloud scale compute to run, deploy and manage ML experiment with Azure ML + +## Pre-requisites +- Complete [Part 0](part_0.md), [Part 1](part_1.md) +- Run each module feature_engineering, ml_training and evaluating successfully in local mode +- Have Azure ML workspace setup with a Compute Cluster named ```cpu-cluster``` + +## Summary +After successfully restructuring the jupyter notebook and run modules locally, your team wants to leverage Azure cloud to run the experiment at scale. +They also want to take advantage of experiment tracking and model management capabilities in Azure ML to keep track of experiment. +Finally, the team wants to deploy the model as a rest endpoint for real time inferencing and experience the option of deploying it as batch inferencing. +To accomplish these goals, you will perform the following: +- Run feature_engineering module as a job in Azure AML +- Run ml_training module as a job in Azure ML and observe the experiment metrics +- Run evaluating module as a job in Azure ML and observe how the model can be registered to Azure ML model's repo +- Run the three modules together as a pipeline +- Deploy and test the produced ML model as an API using Azure Managed Online Endpoint + + +## Steps +1. Go to the workshop folder. + > Action Item: Run the following code snippet. + ```bash + cd src/workshop + ``` +2. 
Set defaults values to configure your resource group and workspace. + > Action Item: Run the following code snippet. + ```bash + az configure --defaults group=YOUR_RESOURCE_GROUP workspace=YOUR_WORKSPACE + ``` + +3. Run the ```feature_engineering.py``` module under the ```data_engineering``` folder by following the steps below: + > Action Items: + > - Run the following code snippet: + ```bash + az ml job create -f core/data_engineering/feature_engineering.yml + ``` + > - Go to Azure ML Studio and locate the run detail for this experiment. + +4. Run the ```ml_training.py``` module under the ```training``` folder by following the steps below: + > Action Items: + > - Run the following code snippet: + ```bash + az ml job create -f core/training/ml_training.yml + ``` + > - Go to Azure ML Studio and locate the run detail for this experiment. + +5. Run the ```ml_evaluating.py``` module under the ```evaluating``` folder by following the steps below: + > Action Items: + > - Run the following code snippet: + + ```bash + az ml job create -f core/evaluating/ml_evaluating.yml + ``` + > - Go to Azure ML Studio and locate the run detail for this experiment. Observe the ML metrics and how the model was logged to Azure ML's model registry. + +6. Create a pipeline that runs the feature_engineering, training and evaluation in one workflow. + > Action Items: Run the pipeline, by running the following code snippet. + + ```bash + az ml job create -f core/pipelines/training_pipeline.yml + ``` + > - Go to the run detail at Azure ML studio and observe the relationship graph among the modules. (See chart below as well.) + +7. Discuss this question: Why should we run the modules both individually and together in a pipeline? + +8. Deploy to Azure ML Managed Online Endpoint by following the steps below: + > Action Items: + > - Update the ```endpoint.yml``` file and ```deployment.yml``` by updating the name of the endpoint (should be a unique name) + > - Create your endpoint + ```bash + az ml online-endpoint create --file core/scoring/endpoint.yml + ``` + > - Create a green deployment + ```bash + az ml online-deployment create --file core/scoring/deployment.yml + ``` + > - Test the deployed service with mock-up data from scoring_test_request.json + ```bash + az ml online-endpoint invoke -n YOUR_ENDPOINT_NAME --deployment green --request-file core/scoring/scoring_test_request.json + ``` + > - Observe the returned scores from the endpoint evaluation. + +### The entire training pipeline is illustrated with this diagram +![training_pipeline](images/training_pipeline.png) + +## Success criteria +- Run the modules individually in Azure +- Capture metrics and models in ml_training and ml_evaluating modules +- Run three modules together in a pipeline +- Model is deployed successfully to managed endpoint. 
+- Testing is successful + +## Reference materials +- [Azure ML CLI v2 tutorial](https://docs.microsoft.com/en-us/learn/paths/train-models-azure-machine-learning-cli-v2/) +- [Azure ML CLI single job examples](https://github.com/Azure/azureml-examples/tree/main/cli/jobs/single-step) +- [Azure ML CLI pipeline examples](https://github.com/Azure/azureml-examples/tree/main/cli/jobs/pipelines) +- [Deploy to managed online endpoint](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-managed-online-endpoints) +- [Deploy to batch endpoint](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-batch-endpoint) + +## [Go to Part 3](part_3.md) diff --git a/src/workshop/documents/part_3.md b/src/workshop/documents/part_3.md index 33a5bb34..cea08553 100644 --- a/src/workshop/documents/part_3.md +++ b/src/workshop/documents/part_3.md @@ -1,75 +1,75 @@ - -# Part 3: Use GitHub for Version Control and Automation - -## Pre-requisites -- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md) - -## Summary -Your team wants to learn how to automate and orchestrate common tasks such as environment setup, training, testing using GitHub Actions. To accomplish this, the following steps will be performed: -- Setup a centralized version control to keep track of project code and manage different feature development tracks and releases -- Learn how to automate and orchestrate common tasks such as environment setup, training, testing by setting up a unit test workflow to run when code is updated in your branch - -## Steps -1. Move to your dev branch you created in step 1 if you are not already there. - - > Action Items: - > - Navigate to the repo if not already there by running ```cd PATH_TO_REPO``` with the proper path to the cloned location. - > - Run following command to check out your "yourname-dev" - - ```bash - git checkout yourname-dev - ``` - -2. Create an automated unit test task that will be triggered by pushing the code to your development/feature branch. Let's use the ```Feature_Engineering``` module as the automated unit test to run to make sure the module performs correctly. - - > Action Items: Update the `workshop_unit_test.yml` file with your secret credentials. Replace the resource group, workspace and location with your specific details. - > - Locate the file named `workshop_unit_test.yml` in the `.github/workflows` folder - > - Make the following updates to the file: - > - Update the secret name by replacing the ```AZURE_SERVICE_PRINCIPAL``` to match the GitHub secret name for your Service Principal that was created in Part 0. (If you followed the naming convention in part 0, there is no need to update this as your secret name should be ```AZURE_SERVICE_PRINCIPAL```.) - > - Update `GROUP`, `NAME`, and `LOCATION` with the specific names of your resource group, workspace, and location created in Part 0. - -3. Next, review the contents in the ```workshop_unit_test.yml``` file to understand the steps and how it is being triggered. - - - Review the trigger defined in the `on:` section to see how this workflow is being run automatically - - The `workflow_dispatch` allows the workflow to be run manually which can be useful when testing. - - The remaining lines highlight what is going to automatically trigger the workflow. It is being triggered on a push to any branch that is not `main` or `integration`. The changes in the push are also filtered to only include changes made to the `feature_engineering` module. 
- - Review the job starting at the `jobs:` section that has been created already and does the following steps: - - Checks out the repo - - Logs into Azure - - Creates an AML job to run feature engineering module using the [custom action](../../../.github/actions/aml-job-create/action.yaml) and the existing [feature engineering job file](../core/data_engineering/feature_engineering.yml) - -4. Now that the necessary changes have been made, the changes can be pushed to your feature branch which will trigger the feature_engineering_unit_test workflow. - - > Action Items: - > - Run the following commands in sequence to stage changes, commit them, and then push them to your repo: - 1. ```bash - git status - ``` - 2. ```bash - git add . - ``` - 3. ```bash - git commit -am "configurations update" - ``` - 4. ```bash - git push origin yourname-dev - ``` - > Note: `git status` shows the files that have been modified. It is useful for seeing the latest status of the files, but isn't necessary to commit changes. - - > - Check to see if the workflow was properly triggered by going to your github repo and selecting the Actions tab. - -## The CI CD Workflow is shown below: -![pipeline](images/part3cicd.png) - -## Success criteria -- A feature or development branch was created to track your changes -- Trigger was created on the workflow file ```workshop_unit_test.yml``` to run on a push to your feature branch -- Understand the additional updates that were made to ```feature_engineering.yml``` file for it to use your secrets and AML resources -- Workflow was successfully triggered by pushing changes to your feature branch - -## Reference materials -- [GitHub Actions](https://github.com/features/actions) -- [GitHub Actions Workflow Triggers](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows) - - -## [Go to Part 4](part_4.md) + +# Part 3: Use GitHub for Version Control and Automation + +## Pre-requisites +- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md) + +## Summary +Your team wants to learn how to automate and orchestrate common tasks such as environment setup, training, testing using GitHub Actions. To accomplish this, the following steps will be performed: +- Setup a centralized version control to keep track of project code and manage different feature development tracks and releases +- Learn how to automate and orchestrate common tasks such as environment setup, training, testing by setting up a unit test workflow to run when code is updated in your branch + +## Steps +1. Move to your dev branch you created in step 1 if you are not already there. + + > Action Items: + > - Navigate to the repo if not already there by running ```cd PATH_TO_REPO``` with the proper path to the cloned location. + > - Run following command to check out your "yourname-dev" + + ```bash + git checkout yourname-dev + ``` + +2. Create an automated unit test task that will be triggered by pushing the code to your development/feature branch. Let's use the ```Feature_Engineering``` module as the automated unit test to run to make sure the module performs correctly. + + > Action Items: Update the `workshop_unit_test.yml` file with your secret credentials. Replace the resource group, workspace and location with your specific details. 
+ > - Locate the file named `workshop_unit_test.yml` in the `.github/workflows` folder + > - Make the following updates to the file: + > - Update the secret name by replacing the ```AZURE_SERVICE_PRINCIPAL``` to match the GitHub secret name for your Service Principal that was created in Part 0. (If you followed the naming convention in part 0, there is no need to update this as your secret name should be ```AZURE_SERVICE_PRINCIPAL```.) + > - Update `GROUP`, `NAME`, and `LOCATION` with the specific names of your resource group, workspace, and location created in Part 0. + +3. Next, review the contents in the ```workshop_unit_test.yml``` file to understand the steps and how it is being triggered. + + - Review the trigger defined in the `on:` section to see how this workflow is being run automatically + - The `workflow_dispatch` allows the workflow to be run manually which can be useful when testing. + - The remaining lines highlight what is going to automatically trigger the workflow. It is being triggered on a push to any branch that is not `main` or `integration`. The changes in the push are also filtered to only include changes made to the `feature_engineering` module. + - Review the job starting at the `jobs:` section that has been created already and does the following steps: + - Checks out the repo + - Logs into Azure + - Creates an AML job to run feature engineering module using the [custom action](../../../.github/actions/aml-job-create/action.yaml) and the existing [feature engineering job file](../core/data_engineering/feature_engineering.yml) + +4. Now that the necessary changes have been made, the changes can be pushed to your feature branch which will trigger the feature_engineering_unit_test workflow. + + > Action Items: + > - Run the following commands in sequence to stage changes, commit them, and then push them to your repo: + 1. ```bash + git status + ``` + 2. ```bash + git add . + ``` + 3. ```bash + git commit -am "configurations update" + ``` + 4. ```bash + git push origin yourname-dev + ``` + > Note: `git status` shows the files that have been modified. It is useful for seeing the latest status of the files, but isn't necessary to commit changes. + + > - Check to see if the workflow was properly triggered by going to your github repo and selecting the Actions tab. 
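+
+> Note: Checking the Actions tab in the browser is the intended flow for this workshop. As an optional alternative, if you happen to have the GitHub CLI (`gh`) installed and authenticated against your fork, a similar check from the terminal could look like the sketch below; the workflow file name matches the `workshop_unit_test.yml` file used above:
+
+```bash
+# List recent runs of the unit test workflow (GitHub CLI, run from inside the cloned repo)
+gh run list --workflow workshop_unit_test.yml --limit 5
+
+# Optionally pick a run interactively and stream its progress until it finishes
+gh run watch
+```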
+ +## The CI CD Workflow is shown below: +![pipeline](images/part3cicd.png) + +## Success criteria +- A feature or development branch was created to track your changes +- Trigger was created on the workflow file ```workshop_unit_test.yml``` to run on a push to your feature branch +- Understand the additional updates that were made to ```feature_engineering.yml``` file for it to use your secrets and AML resources +- Workflow was successfully triggered by pushing changes to your feature branch + +## Reference materials +- [GitHub Actions](https://github.com/features/actions) +- [GitHub Actions Workflow Triggers](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows) + + +## [Go to Part 4](part_4.md) diff --git a/src/workshop/documents/part_4.md b/src/workshop/documents/part_4.md index ce8a219b..e2d44d8c 100644 --- a/src/workshop/documents/part_4.md +++ b/src/workshop/documents/part_4.md @@ -1,103 +1,103 @@ -# Part 4: Continuous Integration (CI) - -## Pre-requisites -- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md), [Part 3](part_3.md) - -## Summary -After learning about how GitHub can be leveraged for MLOps, your team decides to start by automating the model training and evaluation process with a CI pipeline. Continuous Integration (CI) is the process of developing, testing, integrating, and evaluating new features in a staging environment where they are ready for deployment and release. - -## Steps: - -1. Locate the CI pipeline template under ```.github/workflows/workshop_ci.yml``` and add all the needed information for resource group name, workspace name, location and secrets for Azure and Github. They are all tagged with ```#setup```. - - > Action Item: Update resource group name, workspace name, location, Azure Secret and Github Secret inside workshop_ci.yml file. - -2. Now Let's consider a common scenario in a ML development team. One of the team members is going to work on a new feature (examples can be changes to feature engineering, hyper-parameter selection, type of the model, etc). For this work, a common pattern is to first fork and clone the repository on your local machine (which you already have done in Step 0). Then you need to switch to the ```yourname-dev``` local branch which you created in step 3. - - > Action Item: Run the following command to switch to ```yourname-dev``` branch - ```bash - git checkout yourname-dev - ``` - This takes you to yourname-dev branch, so your current working branch is set to yourname-dev. - - > Action Item: Run the following command to ensure you are in the correct branch. - ```bash - git branch - ``` - > Note: Hopefully "yourname-dev" branch is colored green with a * next to it. - -3. In this step we want to make some changes to our ML code, locate and open the following file: ```/src/workshop/core/training/ml_training.py``` - - >Action Item: Update `ml_training.py`, you can search for #setup and modify `alpha` to: `model = Ridge(alpha=100)` - - The default for the model is set to 100,000. By updating alpha we think it will improve the model performance, let's find out! Make sure to save the changes to the file. Now we want to commit these changes to the local branch and push them to our github repository. This will update the remote github branch on the repository. - -4. Run following commands in sequence (one by one) to stage changes, commit them and then push them to your repo. Git status show the files that have been modified. 
It's a useful command to know what's the latest status of the files. - - >Action Items: Run the following commands sequentially: - ```bash - git status - ``` - ```bash - git add . - ``` - ```bash - git commit -am "a short summary of changes made- put your own comments here" - ``` - ```bash - git push origin yourname-dev - ``` -5. At this point you have made some changes to your code and have pushed the changes to your branch on the repository. In order for us to make these changes permanent and take it eventually to deployment and production, we need to place these changes in the "integration" branch. - - >Action Items: - >- Go to your browser and go to your repository. - >- Click on "pull requests" tab and Click on "New pull request". - > - >- Set the `base` branch to `integration` and the `compare` branch to `yourname-dev`. - >- IMPORTANT NOTE: Make sure the integration branch you choose as the base is pointing to your `forked` repository and NOT the Microsoft MLOpsTemplate repository. - >- Click on "Create pull request". - >- Click on "Merge pull request". - - This creates a pull request to the integration branch and merges it. As a reminder, integration branch is a branch which is as up to date as the main branch but we use it to evaluate the new feature. Here we made some changes to the model, and we want to make sure the new model passes the evaluation. If not,it will stop us from going to the CD process and making changes to the main branch where our production code lives. - -6. The merge to the integration branch triggers the workshop_ci workflow. Click on the Actions tab on your repository and you will see CI workflow running after a few minutes. Click and examine all the steps, note that the CI Workflow is running following the steps in the ```workshop_ci.yml``` file which you located earlier. Note that in the first few lines of this file we have defined the workflow to be triggered when a pull request is merged in the "integration" branch. - - The CI workflow has multiple steps, including setting up python version, installing libraries needed, logging in to Azure and running the training model pipeline and evaluating the model. As a part of this workflow, the updated model from our current changes is compared to our best previous model and if it performs better it passes the evaluation step (more details below). - - You can check out different steps of the training pipeline under: ```/src/workshop/pipelines/training_pipeline.yml```. - - >Note: At this point, it takes about 10 minutes for the pipeline to run. - - If all steps pass (you can check the status under the actions in the repository), a new pull request is made to the main branch. If the workflow fails, there could be a few different reasons, you can open the workflow steps on the actions tab of the repository and examine it. Most likely if it fails in this case is due to the evaluation part, where our new model performs worse than our best previous model and doesn't pass the evaluation step and the whole workflow fails. To resolve that please read the optional reading section at the bottom of this page. - - >Note: By design the CI workflow will fail if the new updated model does not perform better than our best previous model and that is expected. The CI workflow prevents promoting a new model that does not pass the evaluation step. - - -> IMPORTANT NOTE: On success on the CI workflow, a Pull Request (PR) to main is created from the integration branch. 
This is by design as per the definition of the CI workflow (see last step in the workflow yml file). -> -> What you will notice will happen, is that another workflow, the CD workflow, is triggered (you could go to 'Actions' in github and see that 'workshop-cd' appeared and is running). We will cover this workflow in the next section. Its trigger is based on having a Pull Request open to main, which is how we automate the CI -> CD chain. -> -> At this point the CD workflow will fail, and this is expected, because we haven't configured it yet (the yaml at this point is incorrect and pointing to incorrect Azure resources for instance). -> -> Another important observation: if you go to the Pull Request, you can see that you'd be allowed to merge the Pull Request to main, even though 'workshop-cd' fails. DO NOT MERGE IT, instead CLOSE IT, but observe that it is inappropriate to have the option to close the PR. -> ->We definitely do not want to allow moving some code to 'main' if something in the integration branch is broken (at this point, the workflow itself is broken, but it could be anything, like the scoring script). Take note of that, as we will setup in the next section a branch protection system that will prevent such a merge to be possible unless the CD workflow is succesful. - -> OPTIONAL READING: For the evaluation and comparison of the current model with our best previous model, we have included some code in the following script: ```/src/workshop/core/evaluating/ml_evaluating.py```. Note that on line 85 of the script we are comparing the R-square of the current model with our best previous model in order to decide if we want to allow any changes to the model and main branch. You might want to edit this and relax it a little bit in order for the evaluation step to pass if you already have a really good model registered. Note that you can change the evaluation metrics based on your actual use case in the future. - -## Success criteria -- Trigger CI workflow when a pull request is merged to the integration branch -- Successfully run the CI workflow which also includes the AML pipeline -- Create a Pull Request to the main branch if new code results in higher performing model - -## Reference materials - -- [GitHub Actions](https://github.com/features/actions) -- [GitHub Personal Access Token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-token) -- [GitHub Actions Workflow Triggers](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows) -- [Azure ML CLI v2](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli) -- [Azure ML CLI v2 Examples](https://github.com/Azure/azureml-examples/tree/main/cli) - - -## [Go to Part 5](part_5.md) - +# Part 4: Continuous Integration (CI) + +## Pre-requisites +- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md), [Part 3](part_3.md) + +## Summary +After learning about how GitHub can be leveraged for MLOps, your team decides to start by automating the model training and evaluation process with a CI pipeline. Continuous Integration (CI) is the process of developing, testing, integrating, and evaluating new features in a staging environment where they are ready for deployment and release. + +## Steps: + +1. Locate the CI pipeline template under ```.github/workflows/workshop_ci.yml``` and add all the needed information for resource group name, workspace name, location and secrets for Azure and Github. 
They are all tagged with ```#setup```. + + > Action Item: Update resource group name, workspace name, location, Azure Secret and Github Secret inside workshop_ci.yml file. + +2. Now Let's consider a common scenario in a ML development team. One of the team members is going to work on a new feature (examples can be changes to feature engineering, hyper-parameter selection, type of the model, etc). For this work, a common pattern is to first fork and clone the repository on your local machine (which you already have done in Step 0). Then you need to switch to the ```yourname-dev``` local branch which you created in step 3. + + > Action Item: Run the following command to switch to ```yourname-dev``` branch + ```bash + git checkout yourname-dev + ``` + This takes you to yourname-dev branch, so your current working branch is set to yourname-dev. + + > Action Item: Run the following command to ensure you are in the correct branch. + ```bash + git branch + ``` + > Note: Hopefully "yourname-dev" branch is colored green with a * next to it. + +3. In this step we want to make some changes to our ML code, locate and open the following file: ```/src/workshop/core/training/ml_training.py``` + + >Action Item: Update `ml_training.py`, you can search for #setup and modify `alpha` to: `model = Ridge(alpha=100)` + + The default for the model is set to 100,000. By updating alpha we think it will improve the model performance, let's find out! Make sure to save the changes to the file. Now we want to commit these changes to the local branch and push them to our github repository. This will update the remote github branch on the repository. + +4. Run following commands in sequence (one by one) to stage changes, commit them and then push them to your repo. Git status show the files that have been modified. It's a useful command to know what's the latest status of the files. + + >Action Items: Run the following commands sequentially: + ```bash + git status + ``` + ```bash + git add . + ``` + ```bash + git commit -am "a short summary of changes made- put your own comments here" + ``` + ```bash + git push origin yourname-dev + ``` +5. At this point you have made some changes to your code and have pushed the changes to your branch on the repository. In order for us to make these changes permanent and take it eventually to deployment and production, we need to place these changes in the "integration" branch. + + >Action Items: + >- Go to your browser and go to your repository. + >- Click on "pull requests" tab and Click on "New pull request". + > + >- Set the `base` branch to `integration` and the `compare` branch to `yourname-dev`. + >- IMPORTANT NOTE: Make sure the integration branch you choose as the base is pointing to your `forked` repository and NOT the Microsoft MLOpsTemplate repository. + >- Click on "Create pull request". + >- Click on "Merge pull request". + + This creates a pull request to the integration branch and merges it. As a reminder, integration branch is a branch which is as up to date as the main branch but we use it to evaluate the new feature. Here we made some changes to the model, and we want to make sure the new model passes the evaluation. If not,it will stop us from going to the CD process and making changes to the main branch where our production code lives. + +6. The merge to the integration branch triggers the workshop_ci workflow. Click on the Actions tab on your repository and you will see CI workflow running after a few minutes. 
Click and examine all the steps; note that the CI workflow follows the steps defined in the ```workshop_ci.yml``` file which you located earlier, and that the first few lines of that file define the trigger: the workflow runs when a pull request is merged into the "integration" branch (as sketched above). + + The CI workflow has multiple steps, including setting up the Python version, installing the required libraries, logging in to Azure, running the model training pipeline and evaluating the model. As part of this workflow, the updated model from our current changes is compared to our best previous model, and if it performs better it passes the evaluation step (more details below). + + You can check out the different steps of the training pipeline under: ```/src/workshop/pipelines/training_pipeline.yml```. + + >Note: At this point, it takes about 10 minutes for the pipeline to run. + + If all steps pass (you can check the status under the Actions tab of the repository), a new pull request is made to the main branch. If the workflow fails, there could be a few different reasons; you can open the workflow steps on the Actions tab of the repository and examine them. The most likely cause of failure in this case is the evaluation step: the new model performs worse than our best previous model, does not pass the evaluation, and the whole workflow fails. To resolve that, please read the optional reading section at the bottom of this page. + + >Note: By design, the CI workflow will fail if the new updated model does not perform better than our best previous model, and that is expected. The CI workflow prevents promoting a new model that does not pass the evaluation step. + + + > IMPORTANT NOTE: On success of the CI workflow, a Pull Request (PR) to main is created from the integration branch. This is by design as per the definition of the CI workflow (see the last step in the workflow yml file). + > + > You will also notice that another workflow, the CD workflow, is triggered (you can go to 'Actions' in GitHub and see that 'workshop-cd' appeared and is running). We will cover this workflow in the next section. Its trigger is based on having a Pull Request open to main, which is how we automate the CI -> CD chain. + > + > At this point the CD workflow will fail, and this is expected, because we haven't configured it yet (the yaml at this point is incorrect and points to incorrect Azure resources, for instance). + > + > Another important observation: if you go to the Pull Request, you can see that you would be allowed to merge the Pull Request to main, even though 'workshop-cd' fails. DO NOT MERGE IT, instead CLOSE IT, but observe that it is inappropriate to have the option to merge the PR at all. + > + >We definitely do not want to allow moving code to 'main' if something in the integration branch is broken (at this point the workflow itself is broken, but it could be anything, like the scoring script). Take note of that, as we will set up in the next section a branch protection rule that prevents such a merge unless the CD workflow is successful. + + > OPTIONAL READING: For the evaluation and comparison of the current model with our best previous model, we have included some code in the following script: ```/src/workshop/core/evaluating/ml_evaluating.py```. Note that on line 85 of the script we compare the R-square of the current model with that of our best previous model in order to decide if we want to allow any changes to the model and main branch.
You might want to edit this and relax it a little bit in order for the evaluation step to pass if you already have a really good model registered. Note that you can change the evaluation metrics based on your actual use case in the future. + +## Success criteria +- Trigger CI workflow when a pull request is merged to the integration branch +- Successfully run the CI workflow which also includes the AML pipeline +- Create a Pull Request to the main branch if new code results in higher performing model + +## Reference materials + +- [GitHub Actions](https://github.com/features/actions) +- [GitHub Personal Access Token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-token) +- [GitHub Actions Workflow Triggers](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows) +- [Azure ML CLI v2](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli) +- [Azure ML CLI v2 Examples](https://github.com/Azure/azureml-examples/tree/main/cli) + + +## [Go to Part 5](part_5.md) + diff --git a/src/workshop/documents/part_5.md b/src/workshop/documents/part_5.md index 6eded09e..118e4c04 100644 --- a/src/workshop/documents/part_5.md +++ b/src/workshop/documents/part_5.md @@ -1,113 +1,113 @@ -# Part 5: Continuous Deployment (CD) - -## Pre-requisites -- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md), [Part 3](part_3.md) and [Part 4](part_4.md) - -## Summary - -After a successful run of the CI pipeline, your team is looking to complete the process with a CD pipeline that will handle the deployment of the model without introducing any downtime in production (otherwise termed as a "hot swap"). - -The goal of this section is to get a fully functional CD pipeline that will: - -1. Trigger based on creation of a Pull Request (PR) to main. -2. Login to Azure using a Service Principal to be able to leverage the Azure ML CLI commands in your workflow. -3. Create a model API endpoint (webservice) using an Azure ML Managed Endpoint and deploy the model to the endpoint into one of the two deployment slots (blue/green slots, which will switch staging/production roles). - - Test the deployment to the endpoint of the new model. - - On success of test, swap the deployment to accept 100% of the service endpoint traffic (and therefore become 'production'). -4. Add a Branch Protection rule in GitHub. - -## Steps - -1. You define triggers as part of a GitHub Actions workflow. The triggers for this workshop have already been defined in `.github/workflows/workshop_cd.yml`. Please review this file to understand how we've establised a trigger mechanism to enable a deployment of code that has succesfully passed CI, and is ready to be deployed to production. - - Review the key elements of the trigger section: - - - 'workflow_dispatch': this enables to run the CD pipeline on demand from the GitHub UI as this will greatly facilitate testing. In practice you would eventually remove this trigger type and fully depend on the rest of the automation. - - - 'pull_request': defines the trigger to kick in when an integration is open to main, and a specific set of files have been modified in the pull request (any code change, or CD definition change that would justify pushing a new deployment live). 
See [Events that Trigger Workflows](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows) for details around the trigger syntax and the type of controls available to build your custom trigger rules. - -2. The CD workflow will rely heavily on the Azure CLI to control the infrastructure and implement the automation of the model deployments. Therefore, we need to setup this workflow to login to Azure via a Service Principal to be able to leverage the Azure CLI. - - > Action Items: - > 1. Open up the `workflow_cd.yml` file in your repo (.github/workflow location) - > 2. Update the 'creds: ${{ secrets...' section in this file to setup your secret name. Follow the instructions in this file annotated with #setup. - - > Note: Please refer to [Use the Azure login action with a service principal secret](https://docs.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Cwindows#use-the-azure-login-action-with-a-service-principal-secret) to create the proper Azure Credentials if you haven't done so already (you should have already defined such secret to complete the CI part of the workshop, i.e. [Part 4](part_4.md)). - -3. We will now configure our Azure ML deployments, and the GitHub workflow which will automate these deployments. - - - Two files control your Azure ML deployments: - - `/core/scoring/endpoint.yml`: this is your endpoint. - Think of this as a virtual load balancer to the actual 'deployments' (actual web services hosting your models,i.e. deployments) - - `/core/scoring/deployment.yml`: this defines an actual deployment to an endpoint. - - You can have as many deployments as you want behind an endpoint. The endpoint traffic routing enables you to control which parts of the traffic to the endpoint gets routed to which deployment. In this workshop, we take the blue/green approach where we'll have 2 deployments (named green and blue respectively), which will take turn playing the role of production/staging. We only have one deployment file define though, as we automatically override the name of the deployment as part of a custom GitHub action which we'll review later in this section. - - > Action Items: - > 1. Edit `endpoint.yml` file to setup the name of your endpoint. This name needs to be unique within the region you are deploying into as the endpoint name is part of the endpoint URI. Look for #setup in that file. - > 2. Edit `deployment.yml` to setup the name of the endpoint this deployment belongs to to the same name you defined just above. Look for #setup in that file. - - Now let's configure the GitHub Actions workflow file that controls the CD process located at `.github/workflows/workshop_cd.yml` - - > Action Item: - >- Edit `workshop_cd.yml` to setup your Azure resource group name and Azure ML workspace name which are being passed as parameters to a set of custom GitHub Actions. Look for #setup and follow the instructions in the file. - - - As you've now noticed, 3 actions control the overall CD flow at the end of the workflow definition. Let's have a look into them in more details, feel free to open their code and review how this logic has been implemented. The key considerations for each file are as follow: - - `.github/actions/aml-endpoint-deploy/action.yaml`: this action does quite a few things: - - Creates an endpoint if it doesn't exist yet using your endpoint.yml definition. - - Checks the traffic on the endpoint, which returns a list of deployments and their respective traffic. 
Based on that information, the action determines which deployment name to use (green or blue). The action will deploy to whichever deployment has 0% traffic (or create one if none exists yet) - - Deploys the latest version of the model (note that the code retrieves the latest version of the model automatically and ignores the version set in the deployment.yml file) to ensure we always release the latest registered model version. - - The deployment has a traffic of '0%' by design as we do not want to enable it to support traffic yet until it's been tested. - - `.github/actions/aml-endpoint-test/action.yaml`: This action is quite simple and does the following: - - Finds the deployment to test by reading the endpoint, and looking for the deployment with 0% traffic - - Tests the 0% traffic endpoint. Note that we do not have a real test it, we just check that the endpoint is 'live' but you'd be most likely checking the output against an expected response or analyzing the response with a simple python test code. Consider looking at [Test new Deployment](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-safely-rollout-managed-endpoints#test-the-new-deployment) to understand your options when it comes to testing endpoints, either via an az ml command, or by calling the endpoint URI directly but specifying a header hint to route your test request to the 0% traffic deployment. - - `.github/actions/aml-endpoint-swap/action.yaml`: This action is also quite simple and consists of two main steps: - - Read the endpoint to see which deployment is at 0% vs 100% - - Operates a traffic update operation to swap around the traffic routing and effectively enabling the latest model version to support 100% of the traffic, i.e. becoming 'production'. - - > Action Items: - > 1. Commit your configuration changes and push them up to github in your own development branch. - > 2. Go to the GitHub UI under 'Actions', and select 'workshop_cd', and trigger it to run now on your own branch. - > 3. Once triggered, click on it to open up the details and monitor its execution. - - > Note: Run this one more time at least to observe the entire flow and the 'swap' of deployments happening automatically with each green/blue swap alternating between supporting 0% of the traffic and 100% of the traffic as they get 'pushed to production'. - -4. The last step to control CD is to setup a GitHub branch protection rule to require a succesful CD run to be able to merge any code into 'main'. This important point will guarantees that the 'main' branch only accepts stable code (and therefore model as an artifact of this code) that has been succesfully rolled to production goes to 'main'. This 'main' branch is therefore always reflecting what's actually in production. - - GitHub offers up an easy way to define such policy. - - > Action Items: - > - Go to your Github repo, and click on 'Settings' - > - Click on 'Branches' under 'Code and automation' - > - Click on 'Add rule' next to the 'Branch protection rules' to create a new rule, keep all defaults and set the following: - > - Branch name pattern: main - > - Require a pull request before merging: CHECK - > - Require status checks to pass before merging: CHECK - > - Status checks that are required: type-in 'Workshop-Deployment' in the search box and select it (it should auto-complete). This name is the job name defined in the workshop_cd.yml file. - > - Click Save Changes to enable this rule on your repo. 
- - You can easily test this rule by creating a pull request to main from integration. - - > Action Items: - > 1. Create a pull request from integration to main (if you have no changes in integration, first commit a simple change in your own dev branch by adding a comment to the score.py script for instance), and bring this over to integration via a Pull Request from your dev branch to integration. Once the CI workflow has completed, a Pull Request from integration to main will be automatically created. - > 2. Observe the status of the Pull Request to main: it should have triggered the CD run (based on the workshop_cd.yml triggers definition), and there should be a rule that prevents merging the Pull Request until the CD workflow completes succesfully. - -## Success criteria - -- The CD pipeline runs sucessfully each time a PR request to 'main' is opened. Please test this by triggering a new CI run (which on success should generate a PR to main), or creating your own PR to main. -- Each CD run updates the deployment which is currently at 0%, and then swaps it to 100% once tested properly via the CD test custom action. - -## Reference materials - -- [GitHub Actions](https://github.com/features/actions) -- [GitHub Actions: Workflow Triggers](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows) -- [Github Actions: Use the Azure login action with a service principal secret](https://docs.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Cwindows#use-the-azure-login-action-with-a-service-principal-secret) -- [Azure ML CLI v2](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli) -- [Azure ML CLI v2 Examples](https://github.com/Azure/azureml-examples/tree/main/cli) -- [Azure ML Managed Endpoints](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-managed-online-endpoints) -- [Azure ML Safe Rollout of Managed Endpoints](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-safely-rollout-managed-endpoints) - -## Congratulations! - -This completes this workshop. We hope you've learned a lot of key concepts and are ready to take this as a template to customize for your own needs and accelerate your ML use cases. +# Part 5: Continuous Deployment (CD) + +## Pre-requisites +- Complete [Part 0](part_0.md), [Part 1](part_1.md), [Part 2](part_2.md), [Part 3](part_3.md) and [Part 4](part_4.md) + +## Summary + +After a successful run of the CI pipeline, your team is looking to complete the process with a CD pipeline that will handle the deployment of the model without introducing any downtime in production (otherwise termed as a "hot swap"). + +The goal of this section is to get a fully functional CD pipeline that will: + +1. Trigger based on creation of a Pull Request (PR) to main. +2. Login to Azure using a Service Principal to be able to leverage the Azure ML CLI commands in your workflow. +3. Create a model API endpoint (webservice) using an Azure ML Managed Endpoint and deploy the model to the endpoint into one of the two deployment slots (blue/green slots, which will switch staging/production roles). + - Test the deployment to the endpoint of the new model. + - On success of test, swap the deployment to accept 100% of the service endpoint traffic (and therefore become 'production'). +4. Add a Branch Protection rule in GitHub. + +## Steps + +1. You define triggers as part of a GitHub Actions workflow. The triggers for this workshop have already been defined in `.github/workflows/workshop_cd.yml`. 
Please review this file to understand how we've established a trigger mechanism that enables a deployment of code which has successfully passed CI and is ready to be deployed to production. + + Review the key elements of the trigger section: + + - 'workflow_dispatch': this lets you run the CD pipeline on demand from the GitHub UI, which greatly facilitates testing. In practice you would eventually remove this trigger type and fully depend on the rest of the automation. + + - 'pull_request': defines the trigger that kicks in when a pull request from integration is opened to main, and a specific set of files have been modified in the pull request (any code change, or CD definition change that would justify pushing a new deployment live). See [Events that Trigger Workflows](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows) for details around the trigger syntax and the type of controls available to build your custom trigger rules. + +2. The CD workflow will rely heavily on the Azure CLI to control the infrastructure and implement the automation of the model deployments. Therefore, we need to set up this workflow to log in to Azure via a Service Principal to be able to leverage the Azure CLI. + + > Action Items: + > 1. Open the `workshop_cd.yml` file in your repo (under .github/workflows) + > 2. Update the 'creds: ${{ secrets...' section in this file to set your secret name. Follow the instructions in this file annotated with #setup. + + > Note: Please refer to [Use the Azure login action with a service principal secret](https://docs.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Cwindows#use-the-azure-login-action-with-a-service-principal-secret) to create the proper Azure credentials if you haven't done so already (you should have already defined such a secret to complete the CI part of the workshop, i.e. [Part 4](part_4.md)). + +3. We will now configure our Azure ML deployments, and the GitHub workflow which will automate these deployments. + + - Two files control your Azure ML deployments: + - `/core/scoring/endpoint.yml`: this is your endpoint. Think of it as a virtual load balancer in front of the actual 'deployments' (the web services hosting your models). + - `/core/scoring/deployment.yml`: this defines an actual deployment to an endpoint. + + You can have as many deployments as you want behind an endpoint. The endpoint traffic routing lets you control what share of the endpoint traffic gets routed to which deployment. In this workshop, we take the blue/green approach where we'll have 2 deployments (named green and blue respectively), which take turns playing the role of production/staging. We only have one deployment file defined though, as we automatically override the name of the deployment as part of a custom GitHub action which we'll review later in this section. A minimal sketch of what these two files might look like is shown after the action items below. + + > Action Items: + > 1. Edit the `endpoint.yml` file to set the name of your endpoint. This name needs to be unique within the region you are deploying into, as the endpoint name is part of the endpoint URI. Look for #setup in that file. + > 2. Edit `deployment.yml` to set the name of the endpoint this deployment belongs to, using the same name you defined just above. Look for #setup in that file.
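The exact contents of these two files are not shown in this document, so the following is only a minimal sketch of what an Azure ML managed online endpoint and deployment definition typically look like; the endpoint name, model, environment and instance settings are assumed values, not the workshop's actual ones.

```yaml
# endpoint.yml (sketch, assumed values)
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
name: my-workshop-endpoint        # #setup: must be unique within the Azure region
auth_mode: key
```

```yaml
# deployment.yml (sketch, assumed values)
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: green                           # overridden to green/blue by the custom deploy action
endpoint_name: my-workshop-endpoint   # #setup: must match the name in endpoint.yml
model: azureml:my-model:1             # the CD action swaps in the latest registered version
code_configuration:
  code: ./
  scoring_script: score.py
environment: azureml:my-scoring-env:1
instance_type: Standard_DS3_v2
instance_count: 1
```

The key relationship to keep in mind is that `endpoint_name` in the deployment file must match `name` in the endpoint file, which is exactly what the two action items above ask you to set.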
+ + Now let's configure the GitHub Actions workflow file that controls the CD process, located at `.github/workflows/workshop_cd.yml` + + > Action Item: + >- Edit `workshop_cd.yml` to set your Azure resource group name and Azure ML workspace name, which are passed as parameters to a set of custom GitHub Actions. Look for #setup and follow the instructions in the file. + + + As you've now noticed, 3 actions control the overall CD flow at the end of the workflow definition. Let's look at them in more detail; feel free to open their code and review how this logic has been implemented. The key considerations for each file are as follows: + - `.github/actions/aml-endpoint-deploy/action.yaml`: this action does quite a few things: + - Creates an endpoint if it doesn't exist yet, using your endpoint.yml definition. + - Checks the traffic on the endpoint, which returns a list of deployments and their respective traffic. Based on that information, the action determines which deployment name to use (green or blue). The action will deploy to whichever deployment has 0% traffic (or create one if none exists yet). + - Deploys the latest version of the model (note that the code retrieves the latest version of the model automatically and ignores the version set in the deployment.yml file) to ensure we always release the latest registered model version. + - The deployment receives 0% of the traffic by design, as we do not want it to serve traffic until it has been tested. + - `.github/actions/aml-endpoint-test/action.yaml`: This action is quite simple and does the following: + - Finds the deployment to test by reading the endpoint and looking for the deployment with 0% traffic. + - Tests the 0% traffic deployment. Note that we do not run a real test here; we just check that the endpoint is 'live'. In practice you would most likely check the output against an expected response or analyze the response with simple Python test code. Consider looking at [Test new Deployment](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-safely-rollout-managed-endpoints#test-the-new-deployment) to understand your options when it comes to testing endpoints, either via an az ml command, or by calling the endpoint URI directly while specifying a header hint to route your test request to the 0% traffic deployment. + - `.github/actions/aml-endpoint-swap/action.yaml`: This action is also quite simple and consists of two main steps: + - Reads the endpoint to see which deployment is at 0% vs 100%. + - Runs a traffic update operation to swap the traffic routing, effectively enabling the latest model version to support 100% of the traffic, i.e. becoming 'production' (a rough sketch of this kind of traffic-swap step follows the note below). + + > Action Items: + > 1. Commit your configuration changes and push them up to GitHub in your own development branch. + > 2. Go to the GitHub UI under 'Actions', select 'workshop_cd', and trigger it to run now on your own branch. + > 3. Once triggered, click on it to open up the details and monitor its execution. + + > Note: Run this at least one more time to observe the entire flow and the 'swap' of deployments happening automatically, with each green/blue swap alternating between supporting 0% and 100% of the traffic as they get 'pushed to production'.
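The workshop's real swap logic lives in `.github/actions/aml-endpoint-swap/action.yaml`; the snippet below is only a rough sketch of the kind of composite-action step it relies on, with the endpoint name and the direction of the swap hard-coded as assumptions (the real action reads the current traffic split and works out the direction dynamically).

```yaml
# Sketch of a blue/green traffic-swap step (assumed names and values)
runs:
  using: "composite"
  steps:
    - name: Swap traffic to the newly tested deployment
      shell: bash
      run: |
        set -e
        ENDPOINT_NAME=my-workshop-endpoint   # assumed; derived from endpoint.yml in the real action
        # Inspect the current traffic split, e.g. {"blue": 100, "green": 0}
        az ml online-endpoint show -n $ENDPOINT_NAME --query traffic
        # Route 100% of the traffic to the deployment that was just tested at 0%
        az ml online-endpoint update --name $ENDPOINT_NAME --traffic "green=100 blue=0"
```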
+4. The last step to control CD is to set up a GitHub branch protection rule that requires a successful CD run before any code can be merged into 'main'. This guarantees that the 'main' branch only accepts stable code (and therefore the model produced as an artifact of this code) that has been successfully rolled out to production. The 'main' branch therefore always reflects what is actually in production. + + GitHub offers an easy way to define such a policy. + + > Action Items: + > - Go to your GitHub repo, and click on 'Settings' + > - Click on 'Branches' under 'Code and automation' + > - Click on 'Add rule' next to 'Branch protection rules' to create a new rule, keep all defaults and set the following: + > - Branch name pattern: main + > - Require a pull request before merging: CHECK + > - Require status checks to pass before merging: CHECK + > - Status checks that are required: type 'Workshop-Deployment' in the search box and select it (it should auto-complete). This name is the job name defined in the workshop_cd.yml file. + > - Click Save Changes to enable this rule on your repo. + + You can easily test this rule by creating a pull request to main from integration. + + > Action Items: + > 1. Create a pull request from integration to main (if you have no changes in integration, first commit a simple change in your own dev branch, for instance by adding a comment to the score.py script, and bring it over to integration via a Pull Request from your dev branch to integration). Once the CI workflow has completed, a Pull Request from integration to main will be automatically created. + > 2. Observe the status of the Pull Request to main: it should have triggered the CD run (based on the workshop_cd.yml triggers definition), and there should be a rule that prevents merging the Pull Request until the CD workflow completes successfully. + +## Success criteria + +- The CD pipeline runs successfully each time a PR to 'main' is opened. Please test this by triggering a new CI run (which on success should generate a PR to main), or by creating your own PR to main. +- Each CD run updates the deployment which is currently at 0% traffic, and then swaps it to 100% once it has been tested via the CD test custom action. + +## Reference materials + +- [GitHub Actions](https://github.com/features/actions) +- [GitHub Actions: Workflow Triggers](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows) +- [Github Actions: Use the Azure login action with a service principal secret](https://docs.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Cwindows#use-the-azure-login-action-with-a-service-principal-secret) +- [Azure ML CLI v2](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli) +- [Azure ML CLI v2 Examples](https://github.com/Azure/azureml-examples/tree/main/cli) +- [Azure ML Managed Endpoints](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-managed-online-endpoints) +- [Azure ML Safe Rollout of Managed Endpoints](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-safely-rollout-managed-endpoints) + +## Congratulations! + +This completes the workshop. We hope you've learned a lot of key concepts and are ready to take this as a template to customize for your own needs and accelerate your ML use cases.
diff --git a/src/workshop/documents/part_tips.md index d6b6acff..887ba456 100644 --- a/src/workshop/documents/part_tips.md +++ b/src/workshop/documents/part_tips.md @@ -1,31 +1,41 @@ -# Pre-Workshop Checklist -> Note: Review the following criteria to ensure you can complete the workshop. These are critical pieces of access to get right for a successful workshop experience. - -## Azure -1. Do you have an Azure account? - -2. Do you have a `Contributor` role for your Azure Subscription? - - If you don't, do you have a `Contributor` role for the Azure Resource Group? - > Note: If you don't, you can't run the workshop. - -3. Do you have a Service Principal? - - If you don't, do you know the Service Principal and it's information (client id, secret)? - - If you don't, can you ask your Cloud team to create the Service Principal for limited scope of a resource group? - > Note: If you don't, you can't run the workshop. - -4. Do you know who can help you to handle issues? - -5. Do you know a person from your Cloud infra/security team who can help you: - - Create Azure resources - - Grant permission - -6. Did you register 'Microsoft.MachineLearningServices' for your Azure subscription? -> Note: If you're not sure, go to the Azure Portal > Subscriptions > 'YourSubscription' > Resource providers' > Search 'Microsoft.MachineLearningServices' - -![ml_services](./images/arm100.png) - -## Github -1. Do you have a Github account? -> Note: If not, create a new account and follow the instructions in Part 0 of the workshop. - -# [Go to Part 0](./part_0.md) +# Pre-Workshop Checklist +> Note: Review the following criteria to ensure you can complete the workshop. These are critical pieces of access to get right for a successful workshop experience. + +## Azure +1. Do you have an Azure account? + +2. Do you have a `Contributor` role for your Azure Subscription? + - If you don't, do you have a `Contributor` role for the Azure Resource Group? + > Note: If you don't, you can't run the workshop. + +3. Do you have a Service Principal? + - If you don't, do you know the Service Principal and its information (client id, secret)? + - If you don't, can you ask your Cloud team to create the Service Principal for the limited scope of a resource group? + > Note: If you don't, you can't run the workshop. + +4. Do you know who can help you to handle issues? + +5. Do you know a person from your Cloud infra/security team who can help you: + - Create Azure resources + - Grant permission + +6. Did you register 'Microsoft.MachineLearningServices' for your Azure subscription? +> Note: If you're not sure, go to the Azure Portal > Subscriptions > 'YourSubscription' > Resource providers > Search 'Microsoft.MachineLearningServices' + +![ml_services](./images/arm100.png) + +## Github +1. Do you have a Github account? +> Note: If not, create a new account and follow the instructions in Part 0 of the workshop. + + + +### Github Self-Hosted Runner +1. If the AML workspace is provisioned with Private Endpoints, GitHub-hosted Actions runners and workflows will not be able to connect to the workspace. +2. You can deploy self-hosted runners in your own environment which can connect to the AML workspace. +3. To do this, please [see this](https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/adding-self-hosted-runners) for more details. +4.
If a self hosted runner is used, modify the files in the workflow folder as below: + + runs-on: [label, linux, X64] + +# [Go to Part 0](./part_0.md) diff --git a/src/workshop/infra/.amlignore b/src/workshop/infra/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/infra/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/infra/.amlignore.amltmp b/src/workshop/infra/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/infra/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/infra/conda.yml b/src/workshop/infra/conda.yml index 45609a19..6b604ee5 100644 --- a/src/workshop/infra/conda.yml +++ b/src/workshop/infra/conda.yml @@ -1,11 +1,11 @@ -name: workshop-online-scoring -channels: - - conda-forge -dependencies: - - python=3.8.12 - - pip=21.3.1 - - pip: - - azureml-mlflow==1.38.0 - - azureml-defaults==1.38.0 - - pandas +name: workshop-online-scoring +channels: + - conda-forge +dependencies: + - python=3.8.12 + - pip=21.3.1 + - pip: + - azureml-mlflow==1.38.0 + - azureml-defaults==1.38.0 + - pandas - scikit-learn==1.0.2 \ No newline at end of file diff --git a/src/workshop/mlops-permission.json b/src/workshop/mlops-permission.json new file mode 100644 index 00000000..fd64e00c --- /dev/null +++ b/src/workshop/mlops-permission.json @@ -0,0 +1,36 @@ +{ + "Name": "MLOps Custom", + "IsCustom": true, + "Description": "Can run pipelines against a published pipeline endpoint", + "Actions": [ + "Microsoft.MachineLearningServices/workspaces/read", + "Microsoft.MachineLearningServices/workspaces/endpoints/pipelines/read", + "Microsoft.MachineLearningServices/workspaces/metadata/artifacts/read", + "Microsoft.MachineLearningServices/workspaces/metadata/snapshots/read", + "Microsoft.MachineLearningServices/workspaces/environments/read", + "Microsoft.MachineLearningServices/workspaces/metadata/secrets/read", + "Microsoft.MachineLearningServices/workspaces/modules/read", + "Microsoft.MachineLearningServices/workspaces/components/read", + "Microsoft.MachineLearningServices/workspaces/datasets/*/read", + "Microsoft.MachineLearningServices/workspaces/datastores/read", + "Microsoft.MachineLearningServices/workspaces/environments/write", + "Microsoft.MachineLearningServices/workspaces/experiments/runs/read", + "Microsoft.MachineLearningServices/workspaces/experiments/runs/write", + "Microsoft.MachineLearningServices/workspaces/experiments/runs/submit/action", + "Microsoft.MachineLearningServices/workspaces/metadata/artifacts/write", + "Microsoft.MachineLearningServices/workspaces/metadata/snapshots/write", + "Microsoft.MachineLearningServices/workspaces/onlineEndpoints/read" + ], + "NotActions": [ + "Microsoft.MachineLearningServices/workspaces/computes/write", + "Microsoft.MachineLearningServices/workspaces/write", + 
"Microsoft.MachineLearningServices/workspaces/computes/delete", + "Microsoft.MachineLearningServices/workspaces/delete", + "Microsoft.MachineLearningServices/workspaces/computes/listKeys/action", + "Microsoft.MachineLearningServices/workspaces/listKeys/action", + "Microsoft.Authorization/*" + ], + "AssignableScopes": [ + "/subscriptions/3e0e14b3-7e28-4da7-97de-0f5cb324f030" + ] +} \ No newline at end of file diff --git a/src/workshop/notebooks/.amlignore b/src/workshop/notebooks/.amlignore new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/notebooks/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/notebooks/.amlignore.amltmp b/src/workshop/notebooks/.amlignore.amltmp new file mode 100644 index 00000000..a7fc4c9f --- /dev/null +++ b/src/workshop/notebooks/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/notebooks/.ipynb_aml_checkpoints/taxi-tutorial-checkpoint2023-1-15-21-55-51Z.ipynb b/src/workshop/notebooks/.ipynb_aml_checkpoints/taxi-tutorial-checkpoint2023-1-15-21-55-51Z.ipynb new file mode 100644 index 00000000..b519258e --- /dev/null +++ b/src/workshop/notebooks/.ipynb_aml_checkpoints/taxi-tutorial-checkpoint2023-1-15-21-55-51Z.ipynb @@ -0,0 +1,2898 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial: Build a regression model with Open Datasets\n", + "\n", + "In this tutorial, you leverage the convenience of Azure Open Datasets to create a regression model to predict NYC taxi fare prices. Easily download publicly available taxi, holiday and weather data to create a dataset that can train a regression model using sklearn." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.opendatasets import NycTlcGreen\n", + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import datetime\n", + "from dateutil.relativedelta import relativedelta\n", + "\n", + "pd.options.mode.chained_assignment = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Data\n", + "Begin by downloading the NYC Taxi dataset from Azure Open Datasets. In non-Spark environments, Open Datasets only allows one month of data at a time with certain classes to avoid MemoryError with large datasets. To download 1 year of taxi data, we will fetch 2000 random samples from each month.\n", + "\n", + "Note: Open Datasets has mirroring classes for working in Spark where data size and memory are not a concern." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpviwf6gni\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=1\\part-00119-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2689-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp6e1co7l5\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=2\\part-00060-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2630-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd5lgxojh\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=3\\part-00196-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2766-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpela340gr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=4\\part-00121-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2691-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpe79pzv2_\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=5\\part-00044-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2614-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpyxyv_8h4\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=6\\part-00108-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2678-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp498a1aem\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=7\\part-00020-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2590-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpuhi_se7a\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=8\\part-00172-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2742-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd7id7xon\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=9\\part-00076-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2646-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp3he0z_qe\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=10\\part-00090-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2660-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1sa8wuxl\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=11\\part-00021-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2591-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1e7uekhr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=12\\part-00116-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2686-1.c000.snappy.parquet\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimelpepDropoffDatetimepassengerCounttripDistancepuLocationIddoLocationIdpickupLongitudepickupLatitudedropoffLongitude...paymentTypefareAmountextramtaTaximprovementSurchargetipAmounttollsAmountehailFeetotalAmounttripType
137986022016-01-14 06:39:002016-01-14 06:44:5511.23NoneNone-73.91182740.775372-73.899635...26.50.00.50.30.000.0NaN7.301.0
37754822016-01-01 06:22:012016-01-01 06:27:1450.91NoneNone-73.96204440.709797-73.946716...25.50.00.50.30.000.0NaN6.301.0
47397622016-01-08 20:55:492016-01-08 21:05:5063.42NoneNone-73.90482340.741776-73.878815...211.50.50.50.30.000.0NaN12.801.0
124668322016-01-15 08:27:412016-01-15 08:41:0513.99NoneNone-73.91148440.854698-73.881821...215.00.00.50.30.000.0NaN15.801.0
115226122016-01-09 04:35:212016-01-09 04:41:0210.98NoneNone-73.92177640.767071-73.933136...16.00.50.50.30.700.0NaN8.001.0
..................................................................
99827312016-12-24 22:03:252016-12-24 22:17:1615.3074235NaNNaNNaN...216.50.50.50.30.000.0NaN17.801.0
85720022016-12-03 20:33:532016-12-03 20:53:5114.8183258NaNNaNNaN...118.50.50.50.33.000.0NaN22.801.0
60776822016-12-18 16:17:542016-12-18 16:33:1332.029556NaNNaNNaN...211.50.00.50.30.000.0NaN12.301.0
7868722016-12-06 09:24:432016-12-06 09:41:0919.516611NaNNaNNaN...227.50.00.50.30.000.0NaN28.301.0
14167222016-12-14 16:12:342016-12-14 16:15:1110.51255256NaNNaNNaN...14.01.00.50.31.450.0NaN7.251.0
\n", + "

24000 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n", + "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n", + "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n", + "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n", + "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n", + "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n", + "... ... ... ... ... \n", + "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n", + "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n", + "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n", + "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n", + "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n", + "\n", + " tripDistance puLocationId doLocationId pickupLongitude \\\n", + "1379860 1.23 None None -73.911827 \n", + "377548 0.91 None None -73.962044 \n", + "473976 3.42 None None -73.904823 \n", + "1246683 3.99 None None -73.911484 \n", + "1152261 0.98 None None -73.921776 \n", + "... ... ... ... ... \n", + "998273 5.30 74 235 NaN \n", + "857200 4.81 83 258 NaN \n", + "607768 2.02 95 56 NaN \n", + "78687 9.51 66 11 NaN \n", + "141672 0.51 255 256 NaN \n", + "\n", + " pickupLatitude dropoffLongitude ... paymentType fareAmount extra \\\n", + "1379860 40.775372 -73.899635 ... 2 6.5 0.0 \n", + "377548 40.709797 -73.946716 ... 2 5.5 0.0 \n", + "473976 40.741776 -73.878815 ... 2 11.5 0.5 \n", + "1246683 40.854698 -73.881821 ... 2 15.0 0.0 \n", + "1152261 40.767071 -73.933136 ... 1 6.0 0.5 \n", + "... ... ... ... ... ... ... \n", + "998273 NaN NaN ... 2 16.5 0.5 \n", + "857200 NaN NaN ... 1 18.5 0.5 \n", + "607768 NaN NaN ... 2 11.5 0.0 \n", + "78687 NaN NaN ... 2 27.5 0.0 \n", + "141672 NaN NaN ... 1 4.0 1.0 \n", + "\n", + " mtaTax improvementSurcharge tipAmount tollsAmount ehailFee \\\n", + "1379860 0.5 0.3 0.00 0.0 NaN \n", + "377548 0.5 0.3 0.00 0.0 NaN \n", + "473976 0.5 0.3 0.00 0.0 NaN \n", + "1246683 0.5 0.3 0.00 0.0 NaN \n", + "1152261 0.5 0.3 0.70 0.0 NaN \n", + "... ... ... ... ... ... \n", + "998273 0.5 0.3 0.00 0.0 NaN \n", + "857200 0.5 0.3 3.00 0.0 NaN \n", + "607768 0.5 0.3 0.00 0.0 NaN \n", + "78687 0.5 0.3 0.00 0.0 NaN \n", + "141672 0.5 0.3 1.45 0.0 NaN \n", + "\n", + " totalAmount tripType \n", + "1379860 7.30 1.0 \n", + "377548 6.30 1.0 \n", + "473976 12.80 1.0 \n", + "1246683 15.80 1.0 \n", + "1152261 8.00 1.0 \n", + "... ... ... \n", + "998273 17.80 1.0 \n", + "857200 22.80 1.0 \n", + "607768 12.30 1.0 \n", + "78687 28.30 1.0 \n", + "141672 7.25 1.0 \n", + "\n", + "[24000 rows x 23 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n", + "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n", + "\n", + "green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \\\n", + " .to_pandas_dataframe().sample(2000) for x in range(12)])\n", + "green_taxi_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that the initial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day. From those, we calculate the sin and cosine transformations to capture the cyclical nature of the variable which will allow the model to factor in time-based seasonality. This function also adds a static feature for the country code to join the holiday data. 
Use the apply() function on the dataframe to iteratively apply this function to each row in the dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimelpepDropoffDatetimepassengerCounttripDistancepuLocationIddoLocationIdpickupLongitudepickupLatitudedropoffLongitude...tripTypemonth_numday_of_monthday_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cos
137986022016-01-14 06:39:002016-01-14 06:44:5511.23NoneNone-73.91182740.775372-73.899635...1.011436US1.0000006.123234e-170.433884-0.900969
37754822016-01-01 06:22:012016-01-01 06:27:1450.91NoneNone-73.96204440.709797-73.946716...1.01146US1.0000006.123234e-17-0.433884-0.900969
47397622016-01-08 20:55:492016-01-08 21:05:5063.42NoneNone-73.90482340.741776-73.878815...1.018420US-0.8660255.000000e-01-0.433884-0.900969
124668322016-01-15 08:27:412016-01-15 08:41:0513.99NoneNone-73.91148440.854698-73.881821...1.011548US0.866025-5.000000e-01-0.433884-0.900969
115226122016-01-09 04:35:212016-01-09 04:41:0210.98NoneNone-73.92177640.767071-73.933136...1.01954US0.8660255.000000e-01-0.974928-0.222521
..................................................................
99827312016-12-24 22:03:252016-12-24 22:17:1615.3074235NaNNaNNaN...1.01224522US-0.5000008.660254e-01-0.974928-0.222521
85720022016-12-03 20:33:532016-12-03 20:53:5114.8183258NaNNaNNaN...1.0123520US-0.8660255.000000e-01-0.974928-0.222521
60776822016-12-18 16:17:542016-12-18 16:33:1332.029556NaNNaNNaN...1.01218616US-0.866025-5.000000e-01-0.7818310.623490
7868722016-12-06 09:24:432016-12-06 09:41:0919.516611NaNNaNNaN...1.012619US0.707107-7.071068e-010.7818310.623490
14167222016-12-14 16:12:342016-12-14 16:15:1110.51255256NaNNaNNaN...1.01214216US-0.866025-5.000000e-010.974928-0.222521
\n", + "

24000 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n", + "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n", + "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n", + "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n", + "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n", + "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n", + "... ... ... ... ... \n", + "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n", + "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n", + "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n", + "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n", + "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n", + "\n", + " tripDistance puLocationId doLocationId pickupLongitude \\\n", + "1379860 1.23 None None -73.911827 \n", + "377548 0.91 None None -73.962044 \n", + "473976 3.42 None None -73.904823 \n", + "1246683 3.99 None None -73.911484 \n", + "1152261 0.98 None None -73.921776 \n", + "... ... ... ... ... \n", + "998273 5.30 74 235 NaN \n", + "857200 4.81 83 258 NaN \n", + "607768 2.02 95 56 NaN \n", + "78687 9.51 66 11 NaN \n", + "141672 0.51 255 256 NaN \n", + "\n", + " pickupLatitude dropoffLongitude ... tripType month_num \\\n", + "1379860 40.775372 -73.899635 ... 1.0 1 \n", + "377548 40.709797 -73.946716 ... 1.0 1 \n", + "473976 40.741776 -73.878815 ... 1.0 1 \n", + "1246683 40.854698 -73.881821 ... 1.0 1 \n", + "1152261 40.767071 -73.933136 ... 1.0 1 \n", + "... ... ... ... ... ... \n", + "998273 NaN NaN ... 1.0 12 \n", + "857200 NaN NaN ... 1.0 12 \n", + "607768 NaN NaN ... 1.0 12 \n", + "78687 NaN NaN ... 1.0 12 \n", + "141672 NaN NaN ... 1.0 12 \n", + "\n", + " day_of_month day_of_week hour_of_day country_code hr_sin \\\n", + "1379860 14 3 6 US 1.000000 \n", + "377548 1 4 6 US 1.000000 \n", + "473976 8 4 20 US -0.866025 \n", + "1246683 15 4 8 US 0.866025 \n", + "1152261 9 5 4 US 0.866025 \n", + "... ... ... ... ... ... \n", + "998273 24 5 22 US -0.500000 \n", + "857200 3 5 20 US -0.866025 \n", + "607768 18 6 16 US -0.866025 \n", + "78687 6 1 9 US 0.707107 \n", + "141672 14 2 16 US -0.866025 \n", + "\n", + " hr_cos dy_sin dy_cos \n", + "1379860 6.123234e-17 0.433884 -0.900969 \n", + "377548 6.123234e-17 -0.433884 -0.900969 \n", + "473976 5.000000e-01 -0.433884 -0.900969 \n", + "1246683 -5.000000e-01 -0.433884 -0.900969 \n", + "1152261 5.000000e-01 -0.974928 -0.222521 \n", + "... ... ... ... 
\n", + "998273 8.660254e-01 -0.974928 -0.222521 \n", + "857200 5.000000e-01 -0.974928 -0.222521 \n", + "607768 -5.000000e-01 -0.781831 0.623490 \n", + "78687 -7.071068e-01 0.781831 0.623490 \n", + "141672 -5.000000e-01 0.974928 -0.222521 \n", + "\n", + "[24000 rows x 32 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def build_time_features(vector):\n", + " pickup_datetime = vector[0]\n", + " month_num = pickup_datetime.month\n", + " day_of_month = pickup_datetime.day\n", + " day_of_week = pickup_datetime.weekday()\n", + " hour_of_day = pickup_datetime.hour\n", + " country_code = \"US\"\n", + " hr_sin = np.sin(hour_of_day*(2.*np.pi/24))\n", + " hr_cos = np.cos(hour_of_day*(2.*np.pi/24))\n", + " dy_sin = np.sin(day_of_week*(2.*np.pi/7))\n", + " dy_cos = np.cos(day_of_week*(2.*np.pi/7))\n", + " \n", + " return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos))\n", + "\n", + "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\", \"country_code\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n", + "green_taxi_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove some of the columns that you won't need for modeling or additional feature building. Rename the time field for pickup time, and additionally convert the time to midnight using `pandas.Series.dt.normalize`. This is done to all time features so that the datetime column can be later used as a key when joining datasets together at a daily level of granularity." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML preview of the trimmed dataframe omitted here; the same five rows appear in the text/plain output below]
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime passengerCount tripDistance \\\n", + "1379860 2 2016-01-14 06:39:00 1 1.23 \n", + "377548 2 2016-01-01 06:22:01 5 0.91 \n", + "473976 2 2016-01-08 20:55:49 6 3.42 \n", + "1246683 2 2016-01-15 08:27:41 1 3.99 \n", + "1152261 2 2016-01-09 04:35:21 1 0.98 \n", + "\n", + " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n", + "1379860 -73.911827 40.775372 -73.899635 40.768333 \n", + "377548 -73.962044 40.709797 -73.946716 40.706902 \n", + "473976 -73.904823 40.741776 -73.878815 40.717625 \n", + "1246683 -73.911484 40.854698 -73.881821 40.882130 \n", + "1152261 -73.921776 40.767071 -73.933136 40.774567 \n", + "\n", + " totalAmount month_num day_of_month day_of_week hour_of_day \\\n", + "1379860 7.3 1 14 3 6 \n", + "377548 6.3 1 1 4 6 \n", + "473976 12.8 1 8 4 20 \n", + "1246683 15.8 1 15 4 8 \n", + "1152261 8.0 1 9 5 4 \n", + "\n", + " country_code hr_sin hr_cos dy_sin dy_cos datetime \n", + "1379860 US 1.000000 6.123234e-17 0.433884 -0.900969 2016-01-14 \n", + "377548 US 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", + "473976 US -0.866025 5.000000e-01 -0.433884 -0.900969 2016-01-08 \n", + "1246683 US 0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-15 \n", + "1152261 US 0.866025 5.000000e-01 -0.974928 -0.222521 2016-01-09 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n", + " \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n", + " \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"]\n", + "\n", + "green_taxi_df.drop(columns_to_remove, axis=1, inplace=True)\n", + "\n", + "green_taxi_df[\"datetime\"] = green_taxi_df[\"lpepPickupDatetime\"].dt.normalize()\n", + "green_taxi_df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Enrich with Holiday Data\n", + "\n", + "Now that the taxi data is downloaded and roughly prepared, add in holiday data as additional features. Holiday-specific features will assist model accuracy, as major holidays are times where taxi demand increases dramatically and supply becomes limited. The holiday dataset is relatively small, so fetch the full set by using the `PublicHolidays` class constructor with no parameters for filtering. Preview the data to check the format." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpya4i60qp\\https%3A\\%2Fazureopendatastorage.azurefd.net\\holidaydatacontainer\\Processed\\part-00000-tid-8468414522853579044-35925ba8-a227-4b80-9c89-17065e7bf1db-649-c000.snappy.parquet\n" + ] + }, + { + "data": { + "text/html": [ + "
[HTML preview of the holiday data omitted here; the same five rows appear in the text/plain output below]
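Because the full `PublicHolidays` table spans many countries and years, a small hedged sketch (it assumes `holidays_df` has been loaded by the `PublicHolidays` call shown in this cell's source, and uses only columns visible in the preview) shows how one could narrow it to the rows that can actually match the taxi data:

```python
# Optional narrowing step (not required for the left join later, which simply
# ignores non-matching rows): keep only US holidays from 2016.
us_2016_holidays = holidays_df[
    (holidays_df["countryRegionCode"] == "US") & (holidays_df["date"].dt.year == 2016)
]
print(us_2016_holidays[["normalizeHolidayName", "date"]].head())
```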
" + ], + "text/plain": [ + " countryOrRegion holidayName normalizeHolidayName \\\n", + "19375 Argentina Año Nuevo [New Year's Day] Año Nuevo [New Year's Day] \n", + "19376 Australia New Year's Day New Year's Day \n", + "19377 Austria Neujahr Neujahr \n", + "19378 Belarus Новый год Новый год \n", + "19379 Belgium Nieuwjaarsdag Nieuwjaarsdag \n", + "\n", + " isPaidTimeOff countryRegionCode date \n", + "19375 None AR 2008-01-01 \n", + "19376 None AU 2008-01-01 \n", + "19377 None AT 2008-01-01 \n", + "19378 None BY 2008-01-01 \n", + "19379 None BE 2008-01-01 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from azureml.opendatasets import PublicHolidays\n", + "\n", + "# call default constructor to download full dataset\n", + "holidays_df = PublicHolidays().to_pandas_dataframe()\n", + "holidays_df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rename the `countryRegionCode` and `date` columns to match the respective field names from the taxi data, and also normalize the time so it can be used as a key. Next, join the holiday data with the taxi data by performing a left-join using the Pandas `merge()` function. This will preserve all records from `green_taxi_df`, but add in holiday data where it exists for the corresponding `datetime` and `country_code`, which in this case is always `\\\"US\\\"`. Preview the data to verify that they were merged correctly." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML preview of the holiday-matched taxi rows (673 rows x 21 columns) omitted here; the same content appears in the text/plain output below]
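To make the join semantics concrete before the real `merge()` in the source below, here is a tiny self-contained sketch with made-up values (the frame and column names mirror the real ones, but the numbers are illustrative only): every left-hand trip is preserved, and the holiday columns are `NaN` wherever no holiday falls on that date.

```python
import pandas as pd

trips = pd.DataFrame({
    "datetime": pd.to_datetime(["2016-01-01", "2016-01-02"]),
    "country_code": ["US", "US"],
    "totalAmount": [6.30, 9.80],  # toy fares, not taken from the real data
})
holidays = pd.DataFrame({
    "datetime": pd.to_datetime(["2016-01-01"]),
    "country_code": ["US"],
    "normalizeHolidayName": ["New Year's Day"],
})

# how="left" keeps both trips; only the Jan 1 trip picks up a holiday name
print(pd.merge(trips, holidays, how="left", on=["datetime", "country_code"]))
```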
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime passengerCount tripDistance \\\n", + "1 2 2016-01-01 06:22:01 5 0.91 \n", + "25 2 2016-01-01 06:14:43 1 2.44 \n", + "27 2 2016-01-01 16:06:33 1 4.57 \n", + "44 2 2016-01-18 11:46:27 1 16.10 \n", + "45 2 2016-01-01 10:41:39 1 3.33 \n", + "... ... ... ... ... \n", + "23868 2 2016-12-25 00:21:23 1 2.36 \n", + "23892 2 2016-12-25 14:05:48 1 1.05 \n", + "23942 1 2016-12-26 01:43:57 1 0.80 \n", + "23978 2 2016-12-26 03:38:33 1 1.55 \n", + "23985 2 2016-12-26 22:12:18 1 3.77 \n", + "\n", + " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n", + "1 -73.962044 40.709797 -73.946716 40.706902 \n", + "25 -73.993576 40.681519 -73.999596 40.655930 \n", + "27 -73.962509 40.687862 -73.981361 40.732758 \n", + "44 -73.925522 40.827877 -73.934982 40.681278 \n", + "45 -73.962891 40.711971 -73.918060 40.736832 \n", + "... ... ... ... ... \n", + "23868 NaN NaN NaN NaN \n", + "23892 NaN NaN NaN NaN \n", + "23942 NaN NaN NaN NaN \n", + "23978 NaN NaN NaN NaN \n", + "23985 NaN NaN NaN NaN \n", + "\n", + " totalAmount month_num ... day_of_week hour_of_day country_code \\\n", + "1 6.30 1 ... 4 6 US \n", + "25 10.30 1 ... 4 6 US \n", + "27 22.25 1 ... 4 16 US \n", + "44 50.30 1 ... 0 11 US \n", + "45 12.80 1 ... 4 10 US \n", + "... ... ... ... ... ... ... \n", + "23868 12.30 12 ... 6 0 US \n", + "23892 12.30 12 ... 6 14 US \n", + "23942 7.55 12 ... 0 1 US \n", + "23978 8.30 12 ... 0 3 US \n", + "23985 16.25 12 ... 0 22 US \n", + "\n", + " hr_sin hr_cos dy_sin dy_cos datetime \\\n", + "1 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", + "25 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", + "27 -0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-01 \n", + "44 0.258819 -9.659258e-01 0.000000 1.000000 2016-01-18 \n", + "45 0.500000 -8.660254e-01 -0.433884 -0.900969 2016-01-01 \n", + "... ... ... ... ... ... \n", + "23868 0.000000 1.000000e+00 -0.781831 0.623490 2016-12-25 \n", + "23892 -0.500000 -8.660254e-01 -0.781831 0.623490 2016-12-25 \n", + "23942 0.258819 9.659258e-01 0.000000 1.000000 2016-12-26 \n", + "23978 0.707107 7.071068e-01 0.000000 1.000000 2016-12-26 \n", + "23985 -0.500000 8.660254e-01 0.000000 1.000000 2016-12-26 \n", + "\n", + " normalizeHolidayName isPaidTimeOff \n", + "1 New Year's Day True \n", + "25 New Year's Day True \n", + "27 New Year's Day True \n", + "44 Martin Luther King Jr. Day None \n", + "45 New Year's Day True \n", + "... ... ... \n", + "23868 Christmas Day True \n", + "23892 Christmas Day True \n", + "23942 Christmas Day True \n", + "23978 Christmas Day True \n", + "23985 Christmas Day True \n", + "\n", + "[673 rows x 21 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "holidays_df = holidays_df.rename(columns={\"countryRegionCode\": \"country_code\"})\n", + "holidays_df[\"datetime\"] = holidays_df[\"date\"].dt.normalize()\n", + "\n", + "holidays_df.drop([\"countryOrRegion\", \"holidayName\", \"date\"], axis=1, inplace=True)\n", + "\n", + "taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how=\"left\", on=[\"datetime\", \"country_code\"])\n", + "taxi_holidays_df[taxi_holidays_df[\"normalizeHolidayName\"].notnull()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Enrich with weather data\n", + "\n", + "Now NOAA surface weather data can be appended to the taxi and holiday data. Use a similar approach to fetch the weather data by downloading one month at a time iteratively. 
Additionally, specify the `cols` parameter with an array of strings to filter the columns to download. This is a very large dataset containing weather surface data from all over the world, so before appending each month, filter the lat/long fields to near NYC using the `query()` function on the dataframe. This will ensure the `weather_df` doesn't get too large." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-2.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-3.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-5.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-6.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-8.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-9.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-11.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-12.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-12.c000.snappy.parquet\n" + ] + } + ], + "source": [ + "from azureml.opendatasets import NoaaIsdWeather\n", + "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n", + "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n", + "\n", + "weather_df = pd.concat([NoaaIsdWeather(cols=[\"temperature\", \"precipTime\", \"precipDepth\"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\\\n", + " .to_pandas_dataframe().query(\"latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature\") for x in range(12)])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML preview of weather_df (55683 rows x 8 columns) omitted here; the same content appears in the text/plain output below]
" + ], + "text/plain": [ + " wban latitude temperature usaf datetime longitude \\\n", + "204647 14732 40.783 2.8 725030 2016-01-02 03:00:00 -73.867 \n", + "204670 14732 40.779 -4.4 725030 2016-01-22 13:51:00 -73.880 \n", + "204694 14732 40.779 5.0 725030 2016-01-08 02:51:00 -73.880 \n", + "204701 14732 40.779 -1.1 725030 2016-01-04 15:51:00 -73.880 \n", + "204715 14732 40.779 4.4 725030 2016-01-01 21:51:00 -73.880 \n", + "... ... ... ... ... ... ... \n", + "1248471 94728 40.789 4.4 725053 2016-12-23 13:51:00 -73.967 \n", + "1248555 94728 40.789 5.0 725053 2016-12-12 13:51:00 -73.967 \n", + "1248580 94728 40.789 3.9 725053 2016-12-18 07:01:00 -73.967 \n", + "1248597 94728 40.789 7.8 725053 2016-12-25 00:51:00 -73.967 \n", + "1248600 94728 40.789 -2.8 725053 2016-12-17 11:10:00 -73.967 \n", + "\n", + " precipDepth precipTime \n", + "204647 NaN NaN \n", + "204670 0.0 1.0 \n", + "204694 0.0 1.0 \n", + "204701 0.0 1.0 \n", + "204715 0.0 1.0 \n", + "... ... ... \n", + "1248471 0.0 1.0 \n", + "1248555 0.0 1.0 \n", + "1248580 NaN NaN \n", + "1248597 0.0 1.0 \n", + "1248600 5.0 1.0 \n", + "\n", + "[55683 rows x 8 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weather_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again call `pandas.Series.dt.normalize` on the `datetime` field in the weather data so it matches the time key in `taxi_holidays_df`.\n", + "\n", + "\n", + "Next group the weather data to have daily aggregated weather values. Define a dict `aggregations` to define how to aggregate each field at a daily level. For`temperature` take the mean and for `precipTime` and `precipDepth` take the daily maximum. Use the `groupby()` function along with the aggregations to group the data. Preview the data to ensure there is one record per day." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML preview of the daily weather aggregates omitted here; the same ten rows appear in the text/plain output below]
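Before the real aggregation in the source below, a small self-contained sketch with made-up station readings illustrates what the `aggregations` dict produces: one output row per day, with the mean of `temperature` and the maximum of `precipTime` and `precipDepth`.

```python
import pandas as pd

readings = pd.DataFrame({
    "datetime": pd.to_datetime(["2016-01-01", "2016-01-01", "2016-01-02"]),
    "temperature": [4.0, 6.0, 2.0],   # toy values, not real observations
    "precipTime": [1.0, 6.0, 1.0],
    "precipDepth": [0.0, 3.0, 0.0],
})

aggregations = {"precipTime": "max", "temperature": "mean", "precipDepth": "max"}
print(readings.groupby("datetime").agg(aggregations))
# 2016-01-01: precipTime 6.0, temperature 5.0, precipDepth 3.0
# 2016-01-02: precipTime 1.0, temperature 2.0, precipDepth 0.0
```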
" + ], + "text/plain": [ + " precipTime temperature precipDepth\n", + "datetime \n", + "2016-01-01 1.0 5.197345 0.0\n", + "2016-01-02 1.0 2.567857 0.0\n", + "2016-01-03 1.0 3.846429 0.0\n", + "2016-01-04 1.0 0.123894 0.0\n", + "2016-01-05 6.0 -7.206250 0.0\n", + "2016-01-06 6.0 -0.896396 0.0\n", + "2016-01-07 6.0 3.180645 0.0\n", + "2016-01-08 1.0 4.384091 0.0\n", + "2016-01-09 6.0 6.710274 3.0\n", + "2016-01-10 24.0 10.943655 254.0" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weather_df[\"datetime\"] = weather_df[\"datetime\"].dt.normalize()\n", + "\n", + "# group by datetime\n", + "aggregations = {\"precipTime\": \"max\", \"temperature\": \"mean\", \"precipDepth\": \"max\"}\n", + "weather_df_grouped = weather_df.groupby(\"datetime\").agg(aggregations)\n", + "weather_df_grouped.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: The examples in this tutorial merge data using Pandas functions and custom aggregations, but the Open Datasets SDK has classes designed to easily merge and enrich data sets. See the [notebook](https://github.com/Azure/OpenDatasetsNotebooks/blob/master/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb) for code examples of these design patterns." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleanse data\n", + "\n", + "Merge the existing taxi and holiday data with the new weather data. This time `datetime` is the only key, and again perform a left-join of the data. Run the `describe()` function on the new dataframe to see summary statistics for each field." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML rendering of taxi_holidays_weather_df.describe() omitted here; the same statistics appear in the text/plain output below]
+      ],
+      "text/plain": [
+       "        vendorID  passengerCount  tripDistance  pickupLongitude  \\\n",
+       "count  24000.000000  24000.000000  24000.000000  12000.000000  \n",
+       "mean   1.789667  1.355292  2.830398  -73.814393  \n",
+       "std    0.407554  1.020018  3.118302  3.016385  \n",
+       "min    1.000000  0.000000  0.000000  -74.164825  \n",
+       "25%    2.000000  1.000000  1.040000  -73.961370  \n",
+       "50%    2.000000  1.000000  1.840000  -73.947132  \n",
+       "75%    2.000000  1.000000  3.500000  -73.919638  \n",
+       "max    2.000000  7.000000  106.680000  0.000000  \n",
+       "\n",
+       "        pickupLatitude  dropoffLongitude  dropoffLatitude  totalAmount  \\\n",
+       "count  12000.000000  12000.000000  12000.000000  24000.000000  \n",
+       "mean   40.678791  -73.837019  40.690729  14.668251  \n",
+       "std    1.663152  2.698609  1.488032  11.738532  \n",
+       "min    0.000000  -75.186440  0.000000  -200.000000  \n",
+       "25%    40.693539  -73.967514  40.695128  7.880000  \n",
+       "50%    40.745928  -73.945869  40.745914  11.300000  \n",
+       "75%    40.802049  -73.913059  40.791076  17.750000  \n",
+       "max    41.081047  0.000000  41.081055  450.000000  \n",
+       "\n",
+       "        month_num  day_of_month  day_of_week  hour_of_day  hr_sin  \\\n",
+       "count  24000.000000  24000.000000  24000.000000  24000.000000  24000.000000  \n",
+       "mean   6.500000  15.068750  3.247792  13.582875  -0.239687  \n",
+       "std    3.452124  8.477555  1.951209  6.708372  0.667528  \n",
+       "min    1.000000  1.000000  0.000000  0.000000  -1.000000  \n",
+       "25%    3.750000  8.000000  2.000000  9.000000  -0.866025  \n",
+       "50%    6.500000  15.000000  3.000000  15.000000  -0.500000  \n",
+       "75%    9.250000  22.000000  5.000000  19.000000  0.258819  \n",
+       "max    12.000000  30.000000  6.000000  23.000000  1.000000  \n",
+       "\n",
+       "        hr_cos  dy_sin  dy_cos  precipTime  temperature  \\\n",
+       "count  2.400000e+04  24000.000000  24000.000000  24000.000000  24000.000000  \n",
+       "mean   -1.510585e-02  -0.079292  -0.059630  13.318667  13.878272  \n",
+       "std    7.048175e-01  0.714457  0.692640  10.333162  9.484443  \n",
+       "min    -1.000000e+00  -0.974928  -0.900969  1.000000  -13.379464  \n",
+       "25%    -7.071068e-01  -0.781831  -0.900969  6.000000  6.620773  \n",
+       "50%    -1.836970e-16  0.000000  -0.222521  6.000000  13.108323  \n",
+       "75%    7.071068e-01  0.781831  0.623490  24.000000  22.944737  \n",
+       "max    1.000000e+00  0.974928  1.000000  24.000000  31.303665  \n",
+       "\n",
+       "        precipDepth  \n",
+       "count  24000.000000  \n",
+       "mean   1037.956292  \n",
+       "std    2788.844868  \n",
+       "min    0.000000  \n",
+       "25%    0.000000  \n",
+       "50%    10.000000  \n",
+       "75%    127.000000  \n",
+       "max    9999.000000  "
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how=\"left\", on=[\"datetime\"])\n",
+    "taxi_holidays_weather_df.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "From the summary statistics, you can see that several fields have outliers or values that will reduce model accuracy. First filter the lat/long fields to the same bounds you used for filtering the weather data. The `tripDistance` field has some bad data: zero-length and, in some samples, negative distances. The `passengerCount` field has bad data as well, with records showing zero or an implausibly large number of passengers. Lastly, the `totalAmount` field has negative values, which don't make sense in the context of our model.\n",
+    "\n",
+    "Filter out these anomalies with a `query()` call that keeps only rows within sensible bounds.\n",
+    "\n",
+    "Note: because a random sample of 2000 rows was taken for each month of the taxi data, the exact statistics may vary each time this notebook is run."
+ ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "final_df = taxi_holidays_weather_df.query(\"pickupLatitude>=40.53 and pickupLatitude<=40.88 and \\\n", + " pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \\\n", + " tripDistance>0 and tripDistance<75 and \\\n", + " passengerCount>0 and passengerCount<100 and \\\n", + " totalAmount>0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Call `describe()` again on the data to ensure cleansing worked as expected. The final data is prepared and cleansed, consisting of taxi, holiday, and weather data, and is ready to use for machine learning model training." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDpassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_dayhr_sinhr_cosdy_sindy_cosprecipTimetemperatureprecipDepth
count11763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.0000001.176300e+0411763.00000011763.00000011763.00000011763.00000011763.000000
mean1.7901901.3692942.841407-73.93791140.746224-73.91090140.73081814.5579173.50131814.9292703.25231713.538553-0.236544-2.265927e-03-0.070226-0.05905911.99396410.288261192.179546
std0.4071911.0416342.8298640.0411210.0568181.3641140.7534689.9891651.7073508.4757931.9481276.7780120.6688127.048492e-010.7188710.68912210.1147758.5300111223.101074
min1.0000001.0000000.010000-74.03519440.572906-74.1830290.0000000.0100001.0000001.0000000.0000000.000000-1.000000-1.000000e+00-0.974928-0.9009691.000000-13.3794640.000000
25%2.0000001.0000001.090000-73.96160140.693594-73.96779340.6954408.1600002.0000008.0000002.0000009.000000-0.866025-7.071068e-01-0.781831-0.9009691.0000003.5045800.000000
50%2.0000001.0000001.900000-73.94751740.745842-73.94624340.74578911.3000004.00000015.0000003.00000015.000000-0.500000-1.836970e-160.000000-0.2225216.00000010.4682763.000000
75%2.0000001.0000003.530000-73.92050940.801752-73.91380740.78994217.3800005.00000022.0000005.00000019.0000000.2588197.071068e-010.7818310.62349024.00000016.96692341.000000
max2.0000006.00000038.850000-73.73889940.8799820.00000041.073185123.8000006.00000030.0000006.00000023.0000001.0000001.000000e+000.9749281.00000024.00000026.5241079999.000000
\n", + "
" + ], + "text/plain": [ + " vendorID passengerCount tripDistance pickupLongitude \\\n", + "count 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean 1.790190 1.369294 2.841407 -73.937911 \n", + "std 0.407191 1.041634 2.829864 0.041121 \n", + "min 1.000000 1.000000 0.010000 -74.035194 \n", + "25% 2.000000 1.000000 1.090000 -73.961601 \n", + "50% 2.000000 1.000000 1.900000 -73.947517 \n", + "75% 2.000000 1.000000 3.530000 -73.920509 \n", + "max 2.000000 6.000000 38.850000 -73.738899 \n", + "\n", + " pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\n", + "count 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean 40.746224 -73.910901 40.730818 14.557917 \n", + "std 0.056818 1.364114 0.753468 9.989165 \n", + "min 40.572906 -74.183029 0.000000 0.010000 \n", + "25% 40.693594 -73.967793 40.695440 8.160000 \n", + "50% 40.745842 -73.946243 40.745789 11.300000 \n", + "75% 40.801752 -73.913807 40.789942 17.380000 \n", + "max 40.879982 0.000000 41.073185 123.800000 \n", + "\n", + " month_num day_of_month day_of_week hour_of_day hr_sin \\\n", + "count 11763.000000 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean 3.501318 14.929270 3.252317 13.538553 -0.236544 \n", + "std 1.707350 8.475793 1.948127 6.778012 0.668812 \n", + "min 1.000000 1.000000 0.000000 0.000000 -1.000000 \n", + "25% 2.000000 8.000000 2.000000 9.000000 -0.866025 \n", + "50% 4.000000 15.000000 3.000000 15.000000 -0.500000 \n", + "75% 5.000000 22.000000 5.000000 19.000000 0.258819 \n", + "max 6.000000 30.000000 6.000000 23.000000 1.000000 \n", + "\n", + " hr_cos dy_sin dy_cos precipTime temperature \\\n", + "count 1.176300e+04 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean -2.265927e-03 -0.070226 -0.059059 11.993964 10.288261 \n", + "std 7.048492e-01 0.718871 0.689122 10.114775 8.530011 \n", + "min -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n", + "25% -7.071068e-01 -0.781831 -0.900969 1.000000 3.504580 \n", + "50% -1.836970e-16 0.000000 -0.222521 6.000000 10.468276 \n", + "75% 7.071068e-01 0.781831 0.623490 24.000000 16.966923 \n", + "max 1.000000e+00 0.974928 1.000000 24.000000 26.524107 \n", + "\n", + " precipDepth \n", + "count 11763.000000 \n", + "mean 192.179546 \n", + "std 1223.101074 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 3.000000 \n", + "75% 41.000000 \n", + "max 9999.000000 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train a model\n", + "\n", + "The data is ready to train a machine learning model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.linear_model import RidgeCV\n", + "from sklearn.linear_model import Ridge\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training Function\n", + "\n", + "Define a function that creates a model pipeline that can be trained and then used for scoring. This pipeline has two steps: preprocessing and model training.\n", + "\n", + "Preprocessing stage:\n", + "The preprocessing step of the pipeline has two branches, one for numerical features and one for categorical features.\n", + "For the numerical features, fill in any blanks with 0's. While the training data may not have any nulls in these fields, future data that is scored may, and this step will take care of those for us. Optionally, a scaler transformation could be added in this step as well. Similarly, for the categorical variables, fill the null values with \"MISSING\". The categorical variables also need to be one-hot encoded, so that step is included in the pipeline.\n", + "\n", + "Model training stage:\n", + "An input parameter determines which type of model to train. Let's test out a (ridge-regularized) linear regression and a random forest model to start.\n", + "\n", + "The two steps are put together into the pipeline, which is what the function returns." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "def createClassModel(algo_name, catg, nums):\n", + " # numerical features: impute missing values with 0\n", + " numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])\n", + " \n", + " # categorical features: impute missing values with \"MISSING\", then one-hot encode\n", + " categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=\"MISSING\")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n", + " \n", + " preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])\n", + " \n", + " if algo_name == 'linear_regression':\n", + " model = Ridge(alpha=100)\n", + " elif algo_name == 'random_forest':\n", + " model = RandomForestRegressor()\n", + " else:\n", + " raise ValueError(f\"Unknown algo_name: {algo_name}\")\n", + " ModelPipeline = Pipeline(steps=[('preprocessor', preprocessor), (\"model\", model)])\n", + " return ModelPipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define the arguments that will be passed to the function. `catg_cols` is a list of the categorical variables that will be transformed in the preprocessing step. `num_cols` is a list of the numerical variables that will be transformed in the preprocessing step. Let's also define the target column as `label` so it can be used in future steps as well."
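Before training, it can help to sanity-check what the preprocessing step of the returned pipeline actually produces. The short sketch below is illustrative only and assumes `createClassModel`, `final_df`, and the `catg_cols`/`num_cols` lists from the next cell are already defined:

```python
# Illustrative sketch: fit only the preprocessing step on a small sample and
# check how many columns come out after imputation + one-hot encoding.
sample = final_df.head(100).copy()
sample[catg_cols] = sample[catg_cols].astype(str)  # mirror the cast done before training

pipeline = createClassModel("random_forest", catg_cols, num_cols)
preprocessor = pipeline.named_steps["preprocessor"]

encoded = preprocessor.fit_transform(sample.drop(columns=["totalAmount"]))
print(encoded.shape)  # (100, n_numeric_cols + n_one_hot_columns)
```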
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "catg_cols = [\"vendorID\", \"month_num\", \"day_of_month\", \"normalizeHolidayName\", \"isPaidTimeOff\"]\n", + "num_cols = [\"passengerCount\", \"tripDistance\", \"precipTime\", \"temperature\", \"precipDepth\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]\n", + "label = [\"totalAmount\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The training is ready to begin, but first, make sure the categorical variables are strings in the dataframe to avoid errors in the pipeline.\n", + "\n", + "Next, the data is split into training and test sets by using the `train_test_split()` function in the `scikit-learn` library. The `test_size` parameter determines the percentage of data to allocate to testing. The `random_state` parameter sets a seed for the random number generator, so that the train-test splits are deterministic.\n", + "\n", + "The training happens in a for loop so that both algorithms can be tested. The `createClassModel` function is called to retrieve the pipeline, which is then trained on the training dataset.\n", + "\n", + "Once trained, the test dataset is run through the model to evaluate its performance. Using functions from `sklearn.metrics`, the R2 score, MAPE, and RMSE are used to measure model performance." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "linear_regression\n", + "R2: 0.8034971051723139\n", + "MAPE: 0.15888983234876766\n", + "RMSE: 4.606544019524053\n", + "\n", + "random_forest\n", + "R2: 0.8073017231520601\n", + "MAPE: 0.14715914748857337\n", + "RMSE: 4.5617309259357475\n", + "\n" + ] + } + ], + "source": [ + "# make sure categorical columns are strings\n", + "final_df[catg_cols] = final_df[catg_cols].astype(\"str\")\n", + "\n", + "# split data\n", + "X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)\n", + "\n", + "# test 2 algorithms\n", + "for algorithmname in [\"linear_regression\", \"random_forest\"]:\n", + " fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline\n", + " fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeline\n", + "\n", + " y_pred = fitPipeline.predict(X_test) # score with fitted pipeline\n", + "\n", + " # Evaluate\n", + " r2 = r2_score(y_test, y_pred)\n", + " mape = mean_absolute_percentage_error(y_test, y_pred)\n", + " rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n", + "\n", + " print(algorithmname)\n", + " print(\"R2:\", r2)\n", + " print(\"MAPE:\", mape)\n", + " print(\"RMSE:\", rmse)\n", + " print()" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "74e9702761b8f12846716a18132904990016d49f378e22e0e13a0e91318de754" + }, + "kernelspec": { + "display_name": "Python 3.8.12 ('mlopsenv')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/workshop/notebooks/.ipynb_aml_checkpoints/taxi-tutorial-checkpoint2023-1-15-23-0-36Z.ipynb 
b/src/workshop/notebooks/.ipynb_aml_checkpoints/taxi-tutorial-checkpoint2023-1-15-23-0-36Z.ipynb new file mode 100644 index 00000000..b519258e --- /dev/null +++ b/src/workshop/notebooks/.ipynb_aml_checkpoints/taxi-tutorial-checkpoint2023-1-15-23-0-36Z.ipynb @@ -0,0 +1,2898 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial: Build a regression model with Open Datasets\n", + "\n", + "In this tutorial, you leverage the convenience of Azure Open Datasets to create a regression model to predict NYC taxi fare prices. Easily download publicly available taxi, holiday and weather data to create a dataset that can train a regression model using sklearn." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.opendatasets import NycTlcGreen\n", + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import datetime\n", + "from dateutil.relativedelta import relativedelta\n", + "\n", + "pd.options.mode.chained_assignment = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Data\n", + "Begin by downloading the NYC Taxi dataset from Azure Open Datasets. In non-Spark environments, Open Datasets only allows one month of data at a time with certain classes to avoid MemoryError with large datasets. To download 1 year of taxi data, we will fetch 2000 random samples from each month.\n", + "\n", + "Note: Open Datasets has mirroring classes for working in Spark where data size and memory are not a concern." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpviwf6gni\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=1\\part-00119-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2689-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp6e1co7l5\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=2\\part-00060-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2630-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd5lgxojh\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=3\\part-00196-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2766-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpela340gr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=4\\part-00121-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2691-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpe79pzv2_\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=5\\part-00044-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2614-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpyxyv_8h4\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=6\\part-00108-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2678-1.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp498a1aem\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=7\\part-00020-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2590-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpuhi_se7a\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=8\\part-00172-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2742-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd7id7xon\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=9\\part-00076-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2646-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp3he0z_qe\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=10\\part-00090-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2660-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1sa8wuxl\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=11\\part-00021-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2591-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1e7uekhr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=12\\part-00116-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2686-1.c000.snappy.parquet\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimelpepDropoffDatetimepassengerCounttripDistancepuLocationIddoLocationIdpickupLongitudepickupLatitudedropoffLongitude...paymentTypefareAmountextramtaTaximprovementSurchargetipAmounttollsAmountehailFeetotalAmounttripType
137986022016-01-14 06:39:002016-01-14 06:44:5511.23NoneNone-73.91182740.775372-73.899635...26.50.00.50.30.000.0NaN7.301.0
37754822016-01-01 06:22:012016-01-01 06:27:1450.91NoneNone-73.96204440.709797-73.946716...25.50.00.50.30.000.0NaN6.301.0
47397622016-01-08 20:55:492016-01-08 21:05:5063.42NoneNone-73.90482340.741776-73.878815...211.50.50.50.30.000.0NaN12.801.0
124668322016-01-15 08:27:412016-01-15 08:41:0513.99NoneNone-73.91148440.854698-73.881821...215.00.00.50.30.000.0NaN15.801.0
115226122016-01-09 04:35:212016-01-09 04:41:0210.98NoneNone-73.92177640.767071-73.933136...16.00.50.50.30.700.0NaN8.001.0
..................................................................
99827312016-12-24 22:03:252016-12-24 22:17:1615.3074235NaNNaNNaN...216.50.50.50.30.000.0NaN17.801.0
85720022016-12-03 20:33:532016-12-03 20:53:5114.8183258NaNNaNNaN...118.50.50.50.33.000.0NaN22.801.0
60776822016-12-18 16:17:542016-12-18 16:33:1332.029556NaNNaNNaN...211.50.00.50.30.000.0NaN12.301.0
7868722016-12-06 09:24:432016-12-06 09:41:0919.516611NaNNaNNaN...227.50.00.50.30.000.0NaN28.301.0
14167222016-12-14 16:12:342016-12-14 16:15:1110.51255256NaNNaNNaN...14.01.00.50.31.450.0NaN7.251.0
\n", + "

24000 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n", + "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n", + "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n", + "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n", + "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n", + "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n", + "... ... ... ... ... \n", + "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n", + "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n", + "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n", + "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n", + "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n", + "\n", + " tripDistance puLocationId doLocationId pickupLongitude \\\n", + "1379860 1.23 None None -73.911827 \n", + "377548 0.91 None None -73.962044 \n", + "473976 3.42 None None -73.904823 \n", + "1246683 3.99 None None -73.911484 \n", + "1152261 0.98 None None -73.921776 \n", + "... ... ... ... ... \n", + "998273 5.30 74 235 NaN \n", + "857200 4.81 83 258 NaN \n", + "607768 2.02 95 56 NaN \n", + "78687 9.51 66 11 NaN \n", + "141672 0.51 255 256 NaN \n", + "\n", + " pickupLatitude dropoffLongitude ... paymentType fareAmount extra \\\n", + "1379860 40.775372 -73.899635 ... 2 6.5 0.0 \n", + "377548 40.709797 -73.946716 ... 2 5.5 0.0 \n", + "473976 40.741776 -73.878815 ... 2 11.5 0.5 \n", + "1246683 40.854698 -73.881821 ... 2 15.0 0.0 \n", + "1152261 40.767071 -73.933136 ... 1 6.0 0.5 \n", + "... ... ... ... ... ... ... \n", + "998273 NaN NaN ... 2 16.5 0.5 \n", + "857200 NaN NaN ... 1 18.5 0.5 \n", + "607768 NaN NaN ... 2 11.5 0.0 \n", + "78687 NaN NaN ... 2 27.5 0.0 \n", + "141672 NaN NaN ... 1 4.0 1.0 \n", + "\n", + " mtaTax improvementSurcharge tipAmount tollsAmount ehailFee \\\n", + "1379860 0.5 0.3 0.00 0.0 NaN \n", + "377548 0.5 0.3 0.00 0.0 NaN \n", + "473976 0.5 0.3 0.00 0.0 NaN \n", + "1246683 0.5 0.3 0.00 0.0 NaN \n", + "1152261 0.5 0.3 0.70 0.0 NaN \n", + "... ... ... ... ... ... \n", + "998273 0.5 0.3 0.00 0.0 NaN \n", + "857200 0.5 0.3 3.00 0.0 NaN \n", + "607768 0.5 0.3 0.00 0.0 NaN \n", + "78687 0.5 0.3 0.00 0.0 NaN \n", + "141672 0.5 0.3 1.45 0.0 NaN \n", + "\n", + " totalAmount tripType \n", + "1379860 7.30 1.0 \n", + "377548 6.30 1.0 \n", + "473976 12.80 1.0 \n", + "1246683 15.80 1.0 \n", + "1152261 8.00 1.0 \n", + "... ... ... \n", + "998273 17.80 1.0 \n", + "857200 22.80 1.0 \n", + "607768 12.30 1.0 \n", + "78687 28.30 1.0 \n", + "141672 7.25 1.0 \n", + "\n", + "[24000 rows x 23 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n", + "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n", + "\n", + "green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \\\n", + " .to_pandas_dataframe().sample(2000) for x in range(12)])\n", + "green_taxi_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that the initial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day. From those, we calculate the sin and cosine transformations to capture the cyclical nature of the variable which will allow the model to factor in time-based seasonality. This function also adds a static feature for the country code to join the holiday data. 
Use the apply() function on the dataframe to iteratively apply this function to each row in the dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
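To see why the sine/cosine encoding captures the cyclical nature of time, here is a small, illustrative check (not part of the original notebook) showing that hour 23 and hour 0 end up close together on the unit circle even though the raw hour values are far apart:

```python
import numpy as np

# Illustrative only: the (sin, cos) pair places each hour on a circle, so 23:00
# and 00:00 are near neighbours, unlike the raw integers 23 and 0.
for hour in (0, 6, 12, 23):
    hr_sin = np.sin(hour * (2. * np.pi / 24))
    hr_cos = np.cos(hour * (2. * np.pi / 24))
    print(f"hour={hour:2d}  hr_sin={hr_sin:+.3f}  hr_cos={hr_cos:+.3f}")
```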
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimelpepDropoffDatetimepassengerCounttripDistancepuLocationIddoLocationIdpickupLongitudepickupLatitudedropoffLongitude...tripTypemonth_numday_of_monthday_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cos
137986022016-01-14 06:39:002016-01-14 06:44:5511.23NoneNone-73.91182740.775372-73.899635...1.011436US1.0000006.123234e-170.433884-0.900969
37754822016-01-01 06:22:012016-01-01 06:27:1450.91NoneNone-73.96204440.709797-73.946716...1.01146US1.0000006.123234e-17-0.433884-0.900969
47397622016-01-08 20:55:492016-01-08 21:05:5063.42NoneNone-73.90482340.741776-73.878815...1.018420US-0.8660255.000000e-01-0.433884-0.900969
124668322016-01-15 08:27:412016-01-15 08:41:0513.99NoneNone-73.91148440.854698-73.881821...1.011548US0.866025-5.000000e-01-0.433884-0.900969
115226122016-01-09 04:35:212016-01-09 04:41:0210.98NoneNone-73.92177640.767071-73.933136...1.01954US0.8660255.000000e-01-0.974928-0.222521
..................................................................
99827312016-12-24 22:03:252016-12-24 22:17:1615.3074235NaNNaNNaN...1.01224522US-0.5000008.660254e-01-0.974928-0.222521
85720022016-12-03 20:33:532016-12-03 20:53:5114.8183258NaNNaNNaN...1.0123520US-0.8660255.000000e-01-0.974928-0.222521
60776822016-12-18 16:17:542016-12-18 16:33:1332.029556NaNNaNNaN...1.01218616US-0.866025-5.000000e-01-0.7818310.623490
7868722016-12-06 09:24:432016-12-06 09:41:0919.516611NaNNaNNaN...1.012619US0.707107-7.071068e-010.7818310.623490
14167222016-12-14 16:12:342016-12-14 16:15:1110.51255256NaNNaNNaN...1.01214216US-0.866025-5.000000e-010.974928-0.222521
\n", + "

24000 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n", + "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n", + "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n", + "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n", + "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n", + "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n", + "... ... ... ... ... \n", + "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n", + "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n", + "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n", + "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n", + "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n", + "\n", + " tripDistance puLocationId doLocationId pickupLongitude \\\n", + "1379860 1.23 None None -73.911827 \n", + "377548 0.91 None None -73.962044 \n", + "473976 3.42 None None -73.904823 \n", + "1246683 3.99 None None -73.911484 \n", + "1152261 0.98 None None -73.921776 \n", + "... ... ... ... ... \n", + "998273 5.30 74 235 NaN \n", + "857200 4.81 83 258 NaN \n", + "607768 2.02 95 56 NaN \n", + "78687 9.51 66 11 NaN \n", + "141672 0.51 255 256 NaN \n", + "\n", + " pickupLatitude dropoffLongitude ... tripType month_num \\\n", + "1379860 40.775372 -73.899635 ... 1.0 1 \n", + "377548 40.709797 -73.946716 ... 1.0 1 \n", + "473976 40.741776 -73.878815 ... 1.0 1 \n", + "1246683 40.854698 -73.881821 ... 1.0 1 \n", + "1152261 40.767071 -73.933136 ... 1.0 1 \n", + "... ... ... ... ... ... \n", + "998273 NaN NaN ... 1.0 12 \n", + "857200 NaN NaN ... 1.0 12 \n", + "607768 NaN NaN ... 1.0 12 \n", + "78687 NaN NaN ... 1.0 12 \n", + "141672 NaN NaN ... 1.0 12 \n", + "\n", + " day_of_month day_of_week hour_of_day country_code hr_sin \\\n", + "1379860 14 3 6 US 1.000000 \n", + "377548 1 4 6 US 1.000000 \n", + "473976 8 4 20 US -0.866025 \n", + "1246683 15 4 8 US 0.866025 \n", + "1152261 9 5 4 US 0.866025 \n", + "... ... ... ... ... ... \n", + "998273 24 5 22 US -0.500000 \n", + "857200 3 5 20 US -0.866025 \n", + "607768 18 6 16 US -0.866025 \n", + "78687 6 1 9 US 0.707107 \n", + "141672 14 2 16 US -0.866025 \n", + "\n", + " hr_cos dy_sin dy_cos \n", + "1379860 6.123234e-17 0.433884 -0.900969 \n", + "377548 6.123234e-17 -0.433884 -0.900969 \n", + "473976 5.000000e-01 -0.433884 -0.900969 \n", + "1246683 -5.000000e-01 -0.433884 -0.900969 \n", + "1152261 5.000000e-01 -0.974928 -0.222521 \n", + "... ... ... ... 
\n", + "998273 8.660254e-01 -0.974928 -0.222521 \n", + "857200 5.000000e-01 -0.974928 -0.222521 \n", + "607768 -5.000000e-01 -0.781831 0.623490 \n", + "78687 -7.071068e-01 0.781831 0.623490 \n", + "141672 -5.000000e-01 0.974928 -0.222521 \n", + "\n", + "[24000 rows x 32 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def build_time_features(vector):\n", + " pickup_datetime = vector[0]\n", + " month_num = pickup_datetime.month\n", + " day_of_month = pickup_datetime.day\n", + " day_of_week = pickup_datetime.weekday()\n", + " hour_of_day = pickup_datetime.hour\n", + " country_code = \"US\"\n", + " hr_sin = np.sin(hour_of_day*(2.*np.pi/24))\n", + " hr_cos = np.cos(hour_of_day*(2.*np.pi/24))\n", + " dy_sin = np.sin(day_of_week*(2.*np.pi/7))\n", + " dy_cos = np.cos(day_of_week*(2.*np.pi/7))\n", + " \n", + " return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos))\n", + "\n", + "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\", \"country_code\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n", + "green_taxi_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove some of the columns that you won't need for modeling or additional feature building. Rename the time field for pickup time, and additionally convert the time to midnight using `pandas.Series.dt.normalize`. This is done to all time features so that the datetime column can be later used as a key when joining datasets together at a daily level of granularity." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimepassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cosdatetime
137986022016-01-14 06:39:0011.23-73.91182740.775372-73.89963540.7683337.311436US1.0000006.123234e-170.433884-0.9009692016-01-14
37754822016-01-01 06:22:0150.91-73.96204440.709797-73.94671640.7069026.31146US1.0000006.123234e-17-0.433884-0.9009692016-01-01
47397622016-01-08 20:55:4963.42-73.90482340.741776-73.87881540.71762512.818420US-0.8660255.000000e-01-0.433884-0.9009692016-01-08
124668322016-01-15 08:27:4113.99-73.91148440.854698-73.88182140.88213015.811548US0.866025-5.000000e-01-0.433884-0.9009692016-01-15
115226122016-01-09 04:35:2110.98-73.92177640.767071-73.93313640.7745678.01954US0.8660255.000000e-01-0.974928-0.2225212016-01-09
\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime passengerCount tripDistance \\\n", + "1379860 2 2016-01-14 06:39:00 1 1.23 \n", + "377548 2 2016-01-01 06:22:01 5 0.91 \n", + "473976 2 2016-01-08 20:55:49 6 3.42 \n", + "1246683 2 2016-01-15 08:27:41 1 3.99 \n", + "1152261 2 2016-01-09 04:35:21 1 0.98 \n", + "\n", + " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n", + "1379860 -73.911827 40.775372 -73.899635 40.768333 \n", + "377548 -73.962044 40.709797 -73.946716 40.706902 \n", + "473976 -73.904823 40.741776 -73.878815 40.717625 \n", + "1246683 -73.911484 40.854698 -73.881821 40.882130 \n", + "1152261 -73.921776 40.767071 -73.933136 40.774567 \n", + "\n", + " totalAmount month_num day_of_month day_of_week hour_of_day \\\n", + "1379860 7.3 1 14 3 6 \n", + "377548 6.3 1 1 4 6 \n", + "473976 12.8 1 8 4 20 \n", + "1246683 15.8 1 15 4 8 \n", + "1152261 8.0 1 9 5 4 \n", + "\n", + " country_code hr_sin hr_cos dy_sin dy_cos datetime \n", + "1379860 US 1.000000 6.123234e-17 0.433884 -0.900969 2016-01-14 \n", + "377548 US 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", + "473976 US -0.866025 5.000000e-01 -0.433884 -0.900969 2016-01-08 \n", + "1246683 US 0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-15 \n", + "1152261 US 0.866025 5.000000e-01 -0.974928 -0.222521 2016-01-09 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n", + " \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n", + " \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"]\n", + "\n", + "green_taxi_df.drop(columns_to_remove, axis=1, inplace=True)\n", + "\n", + "green_taxi_df[\"datetime\"] = green_taxi_df[\"lpepPickupDatetime\"].dt.normalize()\n", + "green_taxi_df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Enrich with Holiday Data\n", + "\n", + "Now that the taxi data is downloaded and roughly prepared, add in holiday data as additional features. Holiday-specific features will assist model accuracy, as major holidays are times where taxi demand increases dramatically and supply becomes limited. The holiday dataset is relatively small, so fetch the full set by using the `PublicHolidays` class constructor with no parameters for filtering. Preview the data to check the format." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpya4i60qp\\https%3A\\%2Fazureopendatastorage.azurefd.net\\holidaydatacontainer\\Processed\\part-00000-tid-8468414522853579044-35925ba8-a227-4b80-9c89-17065e7bf1db-649-c000.snappy.parquet\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryOrRegionholidayNamenormalizeHolidayNameisPaidTimeOffcountryRegionCodedate
19375ArgentinaAño Nuevo [New Year's Day]Año Nuevo [New Year's Day]NoneAR2008-01-01
19376AustraliaNew Year's DayNew Year's DayNoneAU2008-01-01
19377AustriaNeujahrNeujahrNoneAT2008-01-01
19378BelarusНовый годНовый годNoneBY2008-01-01
19379BelgiumNieuwjaarsdagNieuwjaarsdagNoneBE2008-01-01
\n", + "
" + ], + "text/plain": [ + " countryOrRegion holidayName normalizeHolidayName \\\n", + "19375 Argentina Año Nuevo [New Year's Day] Año Nuevo [New Year's Day] \n", + "19376 Australia New Year's Day New Year's Day \n", + "19377 Austria Neujahr Neujahr \n", + "19378 Belarus Новый год Новый год \n", + "19379 Belgium Nieuwjaarsdag Nieuwjaarsdag \n", + "\n", + " isPaidTimeOff countryRegionCode date \n", + "19375 None AR 2008-01-01 \n", + "19376 None AU 2008-01-01 \n", + "19377 None AT 2008-01-01 \n", + "19378 None BY 2008-01-01 \n", + "19379 None BE 2008-01-01 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from azureml.opendatasets import PublicHolidays\n", + "\n", + "# call default constructor to download full dataset\n", + "holidays_df = PublicHolidays().to_pandas_dataframe()\n", + "holidays_df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rename the `countryRegionCode` and `date` columns to match the respective field names from the taxi data, and also normalize the time so it can be used as a key. Next, join the holiday data with the taxi data by performing a left-join using the Pandas `merge()` function. This will preserve all records from `green_taxi_df`, but add in holiday data where it exists for the corresponding `datetime` and `country_code`, which in this case is always `\\\"US\\\"`. Preview the data to verify that they were merged correctly." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimepassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_num...day_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cosdatetimenormalizeHolidayNameisPaidTimeOff
122016-01-01 06:22:0150.91-73.96204440.709797-73.94671640.7069026.301...46US1.0000006.123234e-17-0.433884-0.9009692016-01-01New Year's DayTrue
2522016-01-01 06:14:4312.44-73.99357640.681519-73.99959640.65593010.301...46US1.0000006.123234e-17-0.433884-0.9009692016-01-01New Year's DayTrue
2722016-01-01 16:06:3314.57-73.96250940.687862-73.98136140.73275822.251...416US-0.866025-5.000000e-01-0.433884-0.9009692016-01-01New Year's DayTrue
4422016-01-18 11:46:27116.10-73.92552240.827877-73.93498240.68127850.301...011US0.258819-9.659258e-010.0000001.0000002016-01-18Martin Luther King Jr. DayNone
4522016-01-01 10:41:3913.33-73.96289140.711971-73.91806040.73683212.801...410US0.500000-8.660254e-01-0.433884-0.9009692016-01-01New Year's DayTrue
..................................................................
2386822016-12-25 00:21:2312.36NaNNaNNaNNaN12.3012...60US0.0000001.000000e+00-0.7818310.6234902016-12-25Christmas DayTrue
2389222016-12-25 14:05:4811.05NaNNaNNaNNaN12.3012...614US-0.500000-8.660254e-01-0.7818310.6234902016-12-25Christmas DayTrue
2394212016-12-26 01:43:5710.80NaNNaNNaNNaN7.5512...01US0.2588199.659258e-010.0000001.0000002016-12-26Christmas DayTrue
2397822016-12-26 03:38:3311.55NaNNaNNaNNaN8.3012...03US0.7071077.071068e-010.0000001.0000002016-12-26Christmas DayTrue
2398522016-12-26 22:12:1813.77NaNNaNNaNNaN16.2512...022US-0.5000008.660254e-010.0000001.0000002016-12-26Christmas DayTrue
\n", + "

673 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime passengerCount tripDistance \\\n", + "1 2 2016-01-01 06:22:01 5 0.91 \n", + "25 2 2016-01-01 06:14:43 1 2.44 \n", + "27 2 2016-01-01 16:06:33 1 4.57 \n", + "44 2 2016-01-18 11:46:27 1 16.10 \n", + "45 2 2016-01-01 10:41:39 1 3.33 \n", + "... ... ... ... ... \n", + "23868 2 2016-12-25 00:21:23 1 2.36 \n", + "23892 2 2016-12-25 14:05:48 1 1.05 \n", + "23942 1 2016-12-26 01:43:57 1 0.80 \n", + "23978 2 2016-12-26 03:38:33 1 1.55 \n", + "23985 2 2016-12-26 22:12:18 1 3.77 \n", + "\n", + " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n", + "1 -73.962044 40.709797 -73.946716 40.706902 \n", + "25 -73.993576 40.681519 -73.999596 40.655930 \n", + "27 -73.962509 40.687862 -73.981361 40.732758 \n", + "44 -73.925522 40.827877 -73.934982 40.681278 \n", + "45 -73.962891 40.711971 -73.918060 40.736832 \n", + "... ... ... ... ... \n", + "23868 NaN NaN NaN NaN \n", + "23892 NaN NaN NaN NaN \n", + "23942 NaN NaN NaN NaN \n", + "23978 NaN NaN NaN NaN \n", + "23985 NaN NaN NaN NaN \n", + "\n", + " totalAmount month_num ... day_of_week hour_of_day country_code \\\n", + "1 6.30 1 ... 4 6 US \n", + "25 10.30 1 ... 4 6 US \n", + "27 22.25 1 ... 4 16 US \n", + "44 50.30 1 ... 0 11 US \n", + "45 12.80 1 ... 4 10 US \n", + "... ... ... ... ... ... ... \n", + "23868 12.30 12 ... 6 0 US \n", + "23892 12.30 12 ... 6 14 US \n", + "23942 7.55 12 ... 0 1 US \n", + "23978 8.30 12 ... 0 3 US \n", + "23985 16.25 12 ... 0 22 US \n", + "\n", + " hr_sin hr_cos dy_sin dy_cos datetime \\\n", + "1 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", + "25 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", + "27 -0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-01 \n", + "44 0.258819 -9.659258e-01 0.000000 1.000000 2016-01-18 \n", + "45 0.500000 -8.660254e-01 -0.433884 -0.900969 2016-01-01 \n", + "... ... ... ... ... ... \n", + "23868 0.000000 1.000000e+00 -0.781831 0.623490 2016-12-25 \n", + "23892 -0.500000 -8.660254e-01 -0.781831 0.623490 2016-12-25 \n", + "23942 0.258819 9.659258e-01 0.000000 1.000000 2016-12-26 \n", + "23978 0.707107 7.071068e-01 0.000000 1.000000 2016-12-26 \n", + "23985 -0.500000 8.660254e-01 0.000000 1.000000 2016-12-26 \n", + "\n", + " normalizeHolidayName isPaidTimeOff \n", + "1 New Year's Day True \n", + "25 New Year's Day True \n", + "27 New Year's Day True \n", + "44 Martin Luther King Jr. Day None \n", + "45 New Year's Day True \n", + "... ... ... \n", + "23868 Christmas Day True \n", + "23892 Christmas Day True \n", + "23942 Christmas Day True \n", + "23978 Christmas Day True \n", + "23985 Christmas Day True \n", + "\n", + "[673 rows x 21 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "holidays_df = holidays_df.rename(columns={\"countryRegionCode\": \"country_code\"})\n", + "holidays_df[\"datetime\"] = holidays_df[\"date\"].dt.normalize()\n", + "\n", + "holidays_df.drop([\"countryOrRegion\", \"holidayName\", \"date\"], axis=1, inplace=True)\n", + "\n", + "taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how=\"left\", on=[\"datetime\", \"country_code\"])\n", + "taxi_holidays_df[taxi_holidays_df[\"normalizeHolidayName\"].notnull()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Enrich with weather data\n", + "\n", + "Now NOAA surface weather data can be appended to the taxi and holiday data. Use a similar approach to fetch the weather data by downloading one month at a time iteratively. 
Additionally, specify the `cols` parameter with an array of strings to filter the columns to download. This is a very large dataset containing weather surface data from all over the world, so before appending each month, filter the lat/long fields to near NYC using the `query()` function on the dataframe. This will ensure the `weather_df` doesn't get too large." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-2.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-3.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-5.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-6.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-8.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-9.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-11.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-12.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-12.c000.snappy.parquet\n" + ] + } + ], + "source": [ + "from azureml.opendatasets import NoaaIsdWeather\n", + "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n", + "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n", + "\n", + "weather_df = pd.concat([NoaaIsdWeather(cols=[\"temperature\", \"precipTime\", \"precipDepth\"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\\\n", + " .to_pandas_dataframe().query(\"latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature\") for x in range(12)])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
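A side note on the query() call above: the clause temperature == temperature is a compact NaN filter, because NaN never compares equal to itself, so rows with a missing temperature are dropped along with everything outside the NYC bounding box. A minimal, self-contained sketch of the idiom (the tiny example frame below is made up purely for illustration and is not part of the notebook):

import numpy as np
import pandas as pd

# Hypothetical three-row frame standing in for one month of ISD weather data.
sample = pd.DataFrame({
    "latitude":    [40.70, 40.95, 40.80],
    "longitude":   [-73.90, -73.80, -74.20],
    "temperature": [5.0, np.nan, 7.2],
})

# Same filter shape as the notebook cell: NYC bounding box plus the NaN self-comparison trick.
near_nyc = sample.query(
    "latitude >= 40.53 and latitude <= 40.88 and "
    "longitude >= -74.09 and longitude <= -73.72 and "
    "temperature == temperature"
)
print(near_nyc)  # only the first row survives: inside the box and temperature is not NaN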
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wbanlatitudetemperatureusafdatetimelongitudeprecipDepthprecipTime
2046471473240.7832.87250302016-01-02 03:00:00-73.867NaNNaN
2046701473240.779-4.47250302016-01-22 13:51:00-73.8800.01.0
2046941473240.7795.07250302016-01-08 02:51:00-73.8800.01.0
2047011473240.779-1.17250302016-01-04 15:51:00-73.8800.01.0
2047151473240.7794.47250302016-01-01 21:51:00-73.8800.01.0
...........................
12484719472840.7894.47250532016-12-23 13:51:00-73.9670.01.0
12485559472840.7895.07250532016-12-12 13:51:00-73.9670.01.0
12485809472840.7893.97250532016-12-18 07:01:00-73.967NaNNaN
12485979472840.7897.87250532016-12-25 00:51:00-73.9670.01.0
12486009472840.789-2.87250532016-12-17 11:10:00-73.9675.01.0
\n", + "

55683 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " wban latitude temperature usaf datetime longitude \\\n", + "204647 14732 40.783 2.8 725030 2016-01-02 03:00:00 -73.867 \n", + "204670 14732 40.779 -4.4 725030 2016-01-22 13:51:00 -73.880 \n", + "204694 14732 40.779 5.0 725030 2016-01-08 02:51:00 -73.880 \n", + "204701 14732 40.779 -1.1 725030 2016-01-04 15:51:00 -73.880 \n", + "204715 14732 40.779 4.4 725030 2016-01-01 21:51:00 -73.880 \n", + "... ... ... ... ... ... ... \n", + "1248471 94728 40.789 4.4 725053 2016-12-23 13:51:00 -73.967 \n", + "1248555 94728 40.789 5.0 725053 2016-12-12 13:51:00 -73.967 \n", + "1248580 94728 40.789 3.9 725053 2016-12-18 07:01:00 -73.967 \n", + "1248597 94728 40.789 7.8 725053 2016-12-25 00:51:00 -73.967 \n", + "1248600 94728 40.789 -2.8 725053 2016-12-17 11:10:00 -73.967 \n", + "\n", + " precipDepth precipTime \n", + "204647 NaN NaN \n", + "204670 0.0 1.0 \n", + "204694 0.0 1.0 \n", + "204701 0.0 1.0 \n", + "204715 0.0 1.0 \n", + "... ... ... \n", + "1248471 0.0 1.0 \n", + "1248555 0.0 1.0 \n", + "1248580 NaN NaN \n", + "1248597 0.0 1.0 \n", + "1248600 5.0 1.0 \n", + "\n", + "[55683 rows x 8 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weather_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again call `pandas.Series.dt.normalize` on the `datetime` field in the weather data so it matches the time key in `taxi_holidays_df`.\n", + "\n", + "\n", + "Next group the weather data to have daily aggregated weather values. Define a dict `aggregations` to define how to aggregate each field at a daily level. For`temperature` take the mean and for `precipTime` and `precipDepth` take the daily maximum. Use the `groupby()` function along with the aggregations to group the data. Preview the data to ensure there is one record per day." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precipTimetemperatureprecipDepth
datetime
2016-01-011.05.1973450.0
2016-01-021.02.5678570.0
2016-01-031.03.8464290.0
2016-01-041.00.1238940.0
2016-01-056.0-7.2062500.0
2016-01-066.0-0.8963960.0
2016-01-076.03.1806450.0
2016-01-081.04.3840910.0
2016-01-096.06.7102743.0
2016-01-1024.010.943655254.0
\n", + "
" + ], + "text/plain": [ + " precipTime temperature precipDepth\n", + "datetime \n", + "2016-01-01 1.0 5.197345 0.0\n", + "2016-01-02 1.0 2.567857 0.0\n", + "2016-01-03 1.0 3.846429 0.0\n", + "2016-01-04 1.0 0.123894 0.0\n", + "2016-01-05 6.0 -7.206250 0.0\n", + "2016-01-06 6.0 -0.896396 0.0\n", + "2016-01-07 6.0 3.180645 0.0\n", + "2016-01-08 1.0 4.384091 0.0\n", + "2016-01-09 6.0 6.710274 3.0\n", + "2016-01-10 24.0 10.943655 254.0" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weather_df[\"datetime\"] = weather_df[\"datetime\"].dt.normalize()\n", + "\n", + "# group by datetime\n", + "aggregations = {\"precipTime\": \"max\", \"temperature\": \"mean\", \"precipDepth\": \"max\"}\n", + "weather_df_grouped = weather_df.groupby(\"datetime\").agg(aggregations)\n", + "weather_df_grouped.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: The examples in this tutorial merge data using Pandas functions and custom aggregations, but the Open Datasets SDK has classes designed to easily merge and enrich data sets. See the [notebook](https://github.com/Azure/OpenDatasetsNotebooks/blob/master/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb) for code examples of these design patterns." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleanse data\n", + "\n", + "Merge the existing taxi and holiday data with the new weather data. This time `datetime` is the only key, and again perform a left-join of the data. Run the `describe()` function on the new dataframe to see summary statistics for each field." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDpassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_dayhr_sinhr_cosdy_sindy_cosprecipTimetemperatureprecipDepth
count24000.00000024000.00000024000.00000012000.00000012000.00000012000.00000012000.00000024000.00000024000.00000024000.00000024000.00000024000.00000024000.0000002.400000e+0424000.00000024000.00000024000.00000024000.00000024000.000000
mean1.7896671.3552922.830398-73.81439340.678791-73.83701940.69072914.6682516.50000015.0687503.24779213.582875-0.239687-1.510585e-02-0.079292-0.05963013.31866713.8782721037.956292
std0.4075541.0200183.1183023.0163851.6631522.6986091.48803211.7385323.4521248.4775551.9512096.7083720.6675287.048175e-010.7144570.69264010.3331629.4844432788.844868
min1.0000000.0000000.000000-74.1648250.000000-75.1864400.000000-200.0000001.0000001.0000000.0000000.000000-1.000000-1.000000e+00-0.974928-0.9009691.000000-13.3794640.000000
25%2.0000001.0000001.040000-73.96137040.693539-73.96751440.6951287.8800003.7500008.0000002.0000009.000000-0.866025-7.071068e-01-0.781831-0.9009696.0000006.6207730.000000
50%2.0000001.0000001.840000-73.94713240.745928-73.94586940.74591411.3000006.50000015.0000003.00000015.000000-0.500000-1.836970e-160.000000-0.2225216.00000013.10832310.000000
75%2.0000001.0000003.500000-73.91963840.802049-73.91305940.79107617.7500009.25000022.0000005.00000019.0000000.2588197.071068e-010.7818310.62349024.00000022.944737127.000000
max2.0000007.000000106.6800000.00000041.0810470.00000041.081055450.00000012.00000030.0000006.00000023.0000001.0000001.000000e+000.9749281.00000024.00000031.3036659999.000000
\n", + "
" + ], + "text/plain": [ + " vendorID passengerCount tripDistance pickupLongitude \\\n", + "count 24000.000000 24000.000000 24000.000000 12000.000000 \n", + "mean 1.789667 1.355292 2.830398 -73.814393 \n", + "std 0.407554 1.020018 3.118302 3.016385 \n", + "min 1.000000 0.000000 0.000000 -74.164825 \n", + "25% 2.000000 1.000000 1.040000 -73.961370 \n", + "50% 2.000000 1.000000 1.840000 -73.947132 \n", + "75% 2.000000 1.000000 3.500000 -73.919638 \n", + "max 2.000000 7.000000 106.680000 0.000000 \n", + "\n", + " pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\n", + "count 12000.000000 12000.000000 12000.000000 24000.000000 \n", + "mean 40.678791 -73.837019 40.690729 14.668251 \n", + "std 1.663152 2.698609 1.488032 11.738532 \n", + "min 0.000000 -75.186440 0.000000 -200.000000 \n", + "25% 40.693539 -73.967514 40.695128 7.880000 \n", + "50% 40.745928 -73.945869 40.745914 11.300000 \n", + "75% 40.802049 -73.913059 40.791076 17.750000 \n", + "max 41.081047 0.000000 41.081055 450.000000 \n", + "\n", + " month_num day_of_month day_of_week hour_of_day hr_sin \\\n", + "count 24000.000000 24000.000000 24000.000000 24000.000000 24000.000000 \n", + "mean 6.500000 15.068750 3.247792 13.582875 -0.239687 \n", + "std 3.452124 8.477555 1.951209 6.708372 0.667528 \n", + "min 1.000000 1.000000 0.000000 0.000000 -1.000000 \n", + "25% 3.750000 8.000000 2.000000 9.000000 -0.866025 \n", + "50% 6.500000 15.000000 3.000000 15.000000 -0.500000 \n", + "75% 9.250000 22.000000 5.000000 19.000000 0.258819 \n", + "max 12.000000 30.000000 6.000000 23.000000 1.000000 \n", + "\n", + " hr_cos dy_sin dy_cos precipTime temperature \\\n", + "count 2.400000e+04 24000.000000 24000.000000 24000.000000 24000.000000 \n", + "mean -1.510585e-02 -0.079292 -0.059630 13.318667 13.878272 \n", + "std 7.048175e-01 0.714457 0.692640 10.333162 9.484443 \n", + "min -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n", + "25% -7.071068e-01 -0.781831 -0.900969 6.000000 6.620773 \n", + "50% -1.836970e-16 0.000000 -0.222521 6.000000 13.108323 \n", + "75% 7.071068e-01 0.781831 0.623490 24.000000 22.944737 \n", + "max 1.000000e+00 0.974928 1.000000 24.000000 31.303665 \n", + "\n", + " precipDepth \n", + "count 24000.000000 \n", + "mean 1037.956292 \n", + "std 2788.844868 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 10.000000 \n", + "75% 127.000000 \n", + "max 9999.000000 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how=\"left\", on=[\"datetime\"])\n", + "taxi_holidays_weather_df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the summary statistics, you see that there are several fields that have outliers or values that will reduce model accuracy. First filter the lat/long fields to be within the same bounds you used for filtering weather data. The `tripDistance` field has some bad data, because the minimum value is negative. The `passengerCount` field has bad data as well, with the max value being 210 passengers. Lastly, the `totalAmount` field has negative values, which don't make sense in the context of our model.\n", + "\n", + "Filter out these anomolies using query functions, and then remove the last few columns unnecesary for training.\n", + "\n", + "Note: since a random sample of 2000 was taken for each month of the taxi data, the statistics may vary each time this is ran." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "final_df = taxi_holidays_weather_df.query(\"pickupLatitude>=40.53 and pickupLatitude<=40.88 and \\\n", + " pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \\\n", + " tripDistance>0 and tripDistance<75 and \\\n", + " passengerCount>0 and passengerCount<100 and \\\n", + " totalAmount>0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Call `describe()` again on the data to ensure cleansing worked as expected. The final data is prepared and cleansed, consisting of taxi, holiday, and weather data, and is ready to use for machine learning model training." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDpassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_dayhr_sinhr_cosdy_sindy_cosprecipTimetemperatureprecipDepth
count11763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.0000001.176300e+0411763.00000011763.00000011763.00000011763.00000011763.000000
mean1.7901901.3692942.841407-73.93791140.746224-73.91090140.73081814.5579173.50131814.9292703.25231713.538553-0.236544-2.265927e-03-0.070226-0.05905911.99396410.288261192.179546
std0.4071911.0416342.8298640.0411210.0568181.3641140.7534689.9891651.7073508.4757931.9481276.7780120.6688127.048492e-010.7188710.68912210.1147758.5300111223.101074
min1.0000001.0000000.010000-74.03519440.572906-74.1830290.0000000.0100001.0000001.0000000.0000000.000000-1.000000-1.000000e+00-0.974928-0.9009691.000000-13.3794640.000000
25%2.0000001.0000001.090000-73.96160140.693594-73.96779340.6954408.1600002.0000008.0000002.0000009.000000-0.866025-7.071068e-01-0.781831-0.9009691.0000003.5045800.000000
50%2.0000001.0000001.900000-73.94751740.745842-73.94624340.74578911.3000004.00000015.0000003.00000015.000000-0.500000-1.836970e-160.000000-0.2225216.00000010.4682763.000000
75%2.0000001.0000003.530000-73.92050940.801752-73.91380740.78994217.3800005.00000022.0000005.00000019.0000000.2588197.071068e-010.7818310.62349024.00000016.96692341.000000
max2.0000006.00000038.850000-73.73889940.8799820.00000041.073185123.8000006.00000030.0000006.00000023.0000001.0000001.000000e+000.9749281.00000024.00000026.5241079999.000000
\n", + "
" + ], + "text/plain": [ + " vendorID passengerCount tripDistance pickupLongitude \\\n", + "count 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean 1.790190 1.369294 2.841407 -73.937911 \n", + "std 0.407191 1.041634 2.829864 0.041121 \n", + "min 1.000000 1.000000 0.010000 -74.035194 \n", + "25% 2.000000 1.000000 1.090000 -73.961601 \n", + "50% 2.000000 1.000000 1.900000 -73.947517 \n", + "75% 2.000000 1.000000 3.530000 -73.920509 \n", + "max 2.000000 6.000000 38.850000 -73.738899 \n", + "\n", + " pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\n", + "count 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean 40.746224 -73.910901 40.730818 14.557917 \n", + "std 0.056818 1.364114 0.753468 9.989165 \n", + "min 40.572906 -74.183029 0.000000 0.010000 \n", + "25% 40.693594 -73.967793 40.695440 8.160000 \n", + "50% 40.745842 -73.946243 40.745789 11.300000 \n", + "75% 40.801752 -73.913807 40.789942 17.380000 \n", + "max 40.879982 0.000000 41.073185 123.800000 \n", + "\n", + " month_num day_of_month day_of_week hour_of_day hr_sin \\\n", + "count 11763.000000 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean 3.501318 14.929270 3.252317 13.538553 -0.236544 \n", + "std 1.707350 8.475793 1.948127 6.778012 0.668812 \n", + "min 1.000000 1.000000 0.000000 0.000000 -1.000000 \n", + "25% 2.000000 8.000000 2.000000 9.000000 -0.866025 \n", + "50% 4.000000 15.000000 3.000000 15.000000 -0.500000 \n", + "75% 5.000000 22.000000 5.000000 19.000000 0.258819 \n", + "max 6.000000 30.000000 6.000000 23.000000 1.000000 \n", + "\n", + " hr_cos dy_sin dy_cos precipTime temperature \\\n", + "count 1.176300e+04 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean -2.265927e-03 -0.070226 -0.059059 11.993964 10.288261 \n", + "std 7.048492e-01 0.718871 0.689122 10.114775 8.530011 \n", + "min -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n", + "25% -7.071068e-01 -0.781831 -0.900969 1.000000 3.504580 \n", + "50% -1.836970e-16 0.000000 -0.222521 6.000000 10.468276 \n", + "75% 7.071068e-01 0.781831 0.623490 24.000000 16.966923 \n", + "max 1.000000e+00 0.974928 1.000000 24.000000 26.524107 \n", + "\n", + " precipDepth \n", + "count 11763.000000 \n", + "mean 192.179546 \n", + "std 1223.101074 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 3.000000 \n", + "75% 41.000000 \n", + "max 9999.000000 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train a model\n", + "\n", + "The data is ready to train a machine learning model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.linear_model import RidgeCV\n", + "from sklearn.linear_model import Ridge\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training Function\n", + "\n", + "Define a function that can be used to create a model pipeline that can be trained and then used for scoring. This pipeline has 2 steps: preprocessing and model training.\n", + "\n", + "Preprocessing Stages:\n", + "The preprocessing step of the pipeline also has 2 stages, one for numerical features and one for categorical features.\n", + "For the numerical features, let's fill in any blanks with 0's. While the training data may not have any nulls in the these fields, future data that is scored may and this step will take care of those for us. Optionally, a scaler transformation could be added in this step as well. Similarly for the categorical variables, let's have the null values filled with \"MISSING\". Additionally to the categorical variables, these will need to be one hot encoded, so we will include that step in our pipeline.\n", + "\n", + "Model Training Stage:\n", + "An input parameter will determine which type of model of train. Let's test out a linear regression and random forest model to start. \n", + "\n", + "The two steps are put together into the pipeline which is what the function is returning." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "def createClassModel(algo_name, catg, nums):\n", + " numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])\n", + " \n", + " categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=\"MISSING\")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n", + " \n", + " preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])\n", + " \n", + " if algo_name == 'linear_regression':\n", + " model=Ridge(alpha=100)\n", + " elif algo_name == 'random_forest':\n", + " model = RandomForestRegressor()\n", + " else:\n", + " pass\n", + " ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), (\"model\", model)])\n", + " return ModelPipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define the arguments that will be passed to the function. `catg_cols` is a list of the categorical variables that will be transformed in our processing step. `num_cols` is a list of the numerical variables that will be transformed in our processing step. Let's define the target column as `label` so it can be used in future steps as well." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "catg_cols = [\"vendorID\", \"month_num\", \"day_of_month\", \"normalizeHolidayName\", \"isPaidTimeOff\"]\n", + "num_cols = [\"passengerCount\", \"tripDistance\", \"precipTime\", \"temperature\", \"precipDepth\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]\n", + "label = [\"totalAmount\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The training is ready to begin, but first, let's make sure that the categorical variables are strings in our dataframe to ensure no errors in our pipeline. \n", + "\n", + "Next, the data is split into training and test sets by using the `train_test_split()` function in the `scikit-learn` library. The `test_size` parameter determines the percentage of data to allocate to testing. The `random_state` parameter sets a seed to the random number generator, so that your train-test splits are deterministic.\n", + "\n", + "The training will happen in the for loop so that both algorithms can be tested. The createClassModel funtion is called to retreive the pipeline that can then be trained using the training dataset. \n", + "\n", + "Once trained, the test dataset is then ran through the model to test the model's performance. Using various functions from sklearn.metrics, the R2 score, MAPE, and RMSE can be used to measure model performance." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "linear_regression\n", + "R2: 0.8034971051723139\n", + "MAPE: 0.15888983234876766\n", + "RMSE: 4.606544019524053\n", + "\n", + "random_forest\n", + "R2: 0.8073017231520601\n", + "MAPE: 0.14715914748857337\n", + "RMSE: 4.5617309259357475\n", + "\n" + ] + } + ], + "source": [ + "# make sure categorical columns are strings\n", + "final_df[catg_cols] = final_df[catg_cols].astype(\"str\")\n", + "\n", + "# split data\n", + "X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)\n", + "\n", + "# test 2 algorithms\n", + "for algorithmname in [\"linear_regression\", 'random_forest']:\n", + " fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline\n", + " fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine\n", + "\n", + " y_pred = fitPipeline.predict(X_test) # score with fitted pipeline\n", + "\n", + " # Evaluate\n", + " r2 = r2_score(y_test, y_pred)\n", + " mape = mean_absolute_percentage_error(y_test, y_pred)\n", + " rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n", + "\n", + " print(algorithmname)\n", + " print(\"R2:\", r2)\n", + " print(\"MAPE:\", mape)\n", + " print(\"RMSE:\", rmse)\n", + " print()" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "74e9702761b8f12846716a18132904990016d49f378e22e0e13a0e91318de754" + }, + "kernelspec": { + "display_name": "Python 3.8.12 ('mlopsenv')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/workshop/notebooks/taxi-tutorial.ipynb b/src/workshop/notebooks/taxi-tutorial.ipynb index 41795d69..b519258e 100644 --- 
a/src/workshop/notebooks/taxi-tutorial.ipynb +++ b/src/workshop/notebooks/taxi-tutorial.ipynb @@ -1,2898 +1,2898 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Tutorial: Build a regression model with Open Datasets\n", - "\n", - "In this tutorial, you leverage the convenience of Azure Open Datasets to create a regression model to predict NYC taxi fare prices. Easily download publicly available taxi, holiday and weather data to create a dataset that can train a regression model using sklearn." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.opendatasets import NycTlcGreen\n", - "import pandas as pd\n", - "import numpy as np\n", - "from datetime import datetime\n", - "from dateutil.relativedelta import relativedelta\n", - "\n", - "pd.options.mode.chained_assignment = None" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download Data\n", - "Begin by downloading the NYC Taxi dataset from Azure Open Datasets. In non-Spark environments, Open Datasets only allows one month of data at a time with certain classes to avoid MemoryError with large datasets. To download 1 year of taxi data, we will fetch 2000 random samples from each month.\n", - "\n", - "Note: Open Datasets has mirroring classes for working in Spark where data size and memory are not a concern." - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpviwf6gni\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=1\\part-00119-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2689-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp6e1co7l5\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=2\\part-00060-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2630-2.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd5lgxojh\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=3\\part-00196-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2766-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpela340gr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=4\\part-00121-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2691-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpe79pzv2_\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=5\\part-00044-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2614-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpyxyv_8h4\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=6\\part-00108-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2678-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp498a1aem\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=7\\part-00020-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2590-2.c000.snappy.parquet\n", - "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpuhi_se7a\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=8\\part-00172-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2742-2.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd7id7xon\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=9\\part-00076-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2646-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp3he0z_qe\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=10\\part-00090-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2660-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1sa8wuxl\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=11\\part-00021-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2591-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1e7uekhr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=12\\part-00116-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2686-1.c000.snappy.parquet\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vendorIDlpepPickupDatetimelpepDropoffDatetimepassengerCounttripDistancepuLocationIddoLocationIdpickupLongitudepickupLatitudedropoffLongitude...paymentTypefareAmountextramtaTaximprovementSurchargetipAmounttollsAmountehailFeetotalAmounttripType
137986022016-01-14 06:39:002016-01-14 06:44:5511.23NoneNone-73.91182740.775372-73.899635...26.50.00.50.30.000.0NaN7.301.0
37754822016-01-01 06:22:012016-01-01 06:27:1450.91NoneNone-73.96204440.709797-73.946716...25.50.00.50.30.000.0NaN6.301.0
47397622016-01-08 20:55:492016-01-08 21:05:5063.42NoneNone-73.90482340.741776-73.878815...211.50.50.50.30.000.0NaN12.801.0
124668322016-01-15 08:27:412016-01-15 08:41:0513.99NoneNone-73.91148440.854698-73.881821...215.00.00.50.30.000.0NaN15.801.0
115226122016-01-09 04:35:212016-01-09 04:41:0210.98NoneNone-73.92177640.767071-73.933136...16.00.50.50.30.700.0NaN8.001.0
..................................................................
99827312016-12-24 22:03:252016-12-24 22:17:1615.3074235NaNNaNNaN...216.50.50.50.30.000.0NaN17.801.0
85720022016-12-03 20:33:532016-12-03 20:53:5114.8183258NaNNaNNaN...118.50.50.50.33.000.0NaN22.801.0
60776822016-12-18 16:17:542016-12-18 16:33:1332.029556NaNNaNNaN...211.50.00.50.30.000.0NaN12.301.0
7868722016-12-06 09:24:432016-12-06 09:41:0919.516611NaNNaNNaN...227.50.00.50.30.000.0NaN28.301.0
14167222016-12-14 16:12:342016-12-14 16:15:1110.51255256NaNNaNNaN...14.01.00.50.31.450.0NaN7.251.0
\n", - "

24000 rows × 23 columns

\n", - "
" - ], - "text/plain": [ - " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n", - "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n", - "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n", - "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n", - "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n", - "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n", - "... ... ... ... ... \n", - "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n", - "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n", - "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n", - "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n", - "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n", - "\n", - " tripDistance puLocationId doLocationId pickupLongitude \\\n", - "1379860 1.23 None None -73.911827 \n", - "377548 0.91 None None -73.962044 \n", - "473976 3.42 None None -73.904823 \n", - "1246683 3.99 None None -73.911484 \n", - "1152261 0.98 None None -73.921776 \n", - "... ... ... ... ... \n", - "998273 5.30 74 235 NaN \n", - "857200 4.81 83 258 NaN \n", - "607768 2.02 95 56 NaN \n", - "78687 9.51 66 11 NaN \n", - "141672 0.51 255 256 NaN \n", - "\n", - " pickupLatitude dropoffLongitude ... paymentType fareAmount extra \\\n", - "1379860 40.775372 -73.899635 ... 2 6.5 0.0 \n", - "377548 40.709797 -73.946716 ... 2 5.5 0.0 \n", - "473976 40.741776 -73.878815 ... 2 11.5 0.5 \n", - "1246683 40.854698 -73.881821 ... 2 15.0 0.0 \n", - "1152261 40.767071 -73.933136 ... 1 6.0 0.5 \n", - "... ... ... ... ... ... ... \n", - "998273 NaN NaN ... 2 16.5 0.5 \n", - "857200 NaN NaN ... 1 18.5 0.5 \n", - "607768 NaN NaN ... 2 11.5 0.0 \n", - "78687 NaN NaN ... 2 27.5 0.0 \n", - "141672 NaN NaN ... 1 4.0 1.0 \n", - "\n", - " mtaTax improvementSurcharge tipAmount tollsAmount ehailFee \\\n", - "1379860 0.5 0.3 0.00 0.0 NaN \n", - "377548 0.5 0.3 0.00 0.0 NaN \n", - "473976 0.5 0.3 0.00 0.0 NaN \n", - "1246683 0.5 0.3 0.00 0.0 NaN \n", - "1152261 0.5 0.3 0.70 0.0 NaN \n", - "... ... ... ... ... ... \n", - "998273 0.5 0.3 0.00 0.0 NaN \n", - "857200 0.5 0.3 3.00 0.0 NaN \n", - "607768 0.5 0.3 0.00 0.0 NaN \n", - "78687 0.5 0.3 0.00 0.0 NaN \n", - "141672 0.5 0.3 1.45 0.0 NaN \n", - "\n", - " totalAmount tripType \n", - "1379860 7.30 1.0 \n", - "377548 6.30 1.0 \n", - "473976 12.80 1.0 \n", - "1246683 15.80 1.0 \n", - "1152261 8.00 1.0 \n", - "... ... ... \n", - "998273 17.80 1.0 \n", - "857200 22.80 1.0 \n", - "607768 12.30 1.0 \n", - "78687 28.30 1.0 \n", - "141672 7.25 1.0 \n", - "\n", - "[24000 rows x 23 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n", - "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n", - "\n", - "green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \\\n", - " .to_pandas_dataframe().sample(2000) for x in range(12)])\n", - "green_taxi_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that the initial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day. From those, we calculate the sin and cosine transformations to capture the cyclical nature of the variable which will allow the model to factor in time-based seasonality. This function also adds a static feature for the country code to join the holiday data. 
Use the `apply()` function on the dataframe to iteratively apply this function to each row." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vendorIDlpepPickupDatetimelpepDropoffDatetimepassengerCounttripDistancepuLocationIddoLocationIdpickupLongitudepickupLatitudedropoffLongitude...tripTypemonth_numday_of_monthday_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cos
137986022016-01-14 06:39:002016-01-14 06:44:5511.23NoneNone-73.91182740.775372-73.899635...1.011436US1.0000006.123234e-170.433884-0.900969
37754822016-01-01 06:22:012016-01-01 06:27:1450.91NoneNone-73.96204440.709797-73.946716...1.01146US1.0000006.123234e-17-0.433884-0.900969
47397622016-01-08 20:55:492016-01-08 21:05:5063.42NoneNone-73.90482340.741776-73.878815...1.018420US-0.8660255.000000e-01-0.433884-0.900969
124668322016-01-15 08:27:412016-01-15 08:41:0513.99NoneNone-73.91148440.854698-73.881821...1.011548US0.866025-5.000000e-01-0.433884-0.900969
115226122016-01-09 04:35:212016-01-09 04:41:0210.98NoneNone-73.92177640.767071-73.933136...1.01954US0.8660255.000000e-01-0.974928-0.222521
..................................................................
99827312016-12-24 22:03:252016-12-24 22:17:1615.3074235NaNNaNNaN...1.01224522US-0.5000008.660254e-01-0.974928-0.222521
85720022016-12-03 20:33:532016-12-03 20:53:5114.8183258NaNNaNNaN...1.0123520US-0.8660255.000000e-01-0.974928-0.222521
60776822016-12-18 16:17:542016-12-18 16:33:1332.029556NaNNaNNaN...1.01218616US-0.866025-5.000000e-01-0.7818310.623490
7868722016-12-06 09:24:432016-12-06 09:41:0919.516611NaNNaNNaN...1.012619US0.707107-7.071068e-010.7818310.623490
14167222016-12-14 16:12:342016-12-14 16:15:1110.51255256NaNNaNNaN...1.01214216US-0.866025-5.000000e-010.974928-0.222521
\n", - "

24000 rows × 32 columns

\n", - "
" - ], - "text/plain": [ - " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n", - "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n", - "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n", - "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n", - "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n", - "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n", - "... ... ... ... ... \n", - "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n", - "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n", - "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n", - "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n", - "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n", - "\n", - " tripDistance puLocationId doLocationId pickupLongitude \\\n", - "1379860 1.23 None None -73.911827 \n", - "377548 0.91 None None -73.962044 \n", - "473976 3.42 None None -73.904823 \n", - "1246683 3.99 None None -73.911484 \n", - "1152261 0.98 None None -73.921776 \n", - "... ... ... ... ... \n", - "998273 5.30 74 235 NaN \n", - "857200 4.81 83 258 NaN \n", - "607768 2.02 95 56 NaN \n", - "78687 9.51 66 11 NaN \n", - "141672 0.51 255 256 NaN \n", - "\n", - " pickupLatitude dropoffLongitude ... tripType month_num \\\n", - "1379860 40.775372 -73.899635 ... 1.0 1 \n", - "377548 40.709797 -73.946716 ... 1.0 1 \n", - "473976 40.741776 -73.878815 ... 1.0 1 \n", - "1246683 40.854698 -73.881821 ... 1.0 1 \n", - "1152261 40.767071 -73.933136 ... 1.0 1 \n", - "... ... ... ... ... ... \n", - "998273 NaN NaN ... 1.0 12 \n", - "857200 NaN NaN ... 1.0 12 \n", - "607768 NaN NaN ... 1.0 12 \n", - "78687 NaN NaN ... 1.0 12 \n", - "141672 NaN NaN ... 1.0 12 \n", - "\n", - " day_of_month day_of_week hour_of_day country_code hr_sin \\\n", - "1379860 14 3 6 US 1.000000 \n", - "377548 1 4 6 US 1.000000 \n", - "473976 8 4 20 US -0.866025 \n", - "1246683 15 4 8 US 0.866025 \n", - "1152261 9 5 4 US 0.866025 \n", - "... ... ... ... ... ... \n", - "998273 24 5 22 US -0.500000 \n", - "857200 3 5 20 US -0.866025 \n", - "607768 18 6 16 US -0.866025 \n", - "78687 6 1 9 US 0.707107 \n", - "141672 14 2 16 US -0.866025 \n", - "\n", - " hr_cos dy_sin dy_cos \n", - "1379860 6.123234e-17 0.433884 -0.900969 \n", - "377548 6.123234e-17 -0.433884 -0.900969 \n", - "473976 5.000000e-01 -0.433884 -0.900969 \n", - "1246683 -5.000000e-01 -0.433884 -0.900969 \n", - "1152261 5.000000e-01 -0.974928 -0.222521 \n", - "... ... ... ... 
\n", - "998273 8.660254e-01 -0.974928 -0.222521 \n", - "857200 5.000000e-01 -0.974928 -0.222521 \n", - "607768 -5.000000e-01 -0.781831 0.623490 \n", - "78687 -7.071068e-01 0.781831 0.623490 \n", - "141672 -5.000000e-01 0.974928 -0.222521 \n", - "\n", - "[24000 rows x 32 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def build_time_features(vector):\n", - " pickup_datetime = vector[0]\n", - " month_num = pickup_datetime.month\n", - " day_of_month = pickup_datetime.day\n", - " day_of_week = pickup_datetime.weekday()\n", - " hour_of_day = pickup_datetime.hour\n", - " country_code = \"US\"\n", - " hr_sin = np.sin(hour_of_day*(2.*np.pi/24))\n", - " hr_cos = np.cos(hour_of_day*(2.*np.pi/24))\n", - " dy_sin = np.sin(day_of_week*(2.*np.pi/7))\n", - " dy_cos = np.cos(day_of_week*(2.*np.pi/7))\n", - " \n", - " return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos))\n", - "\n", - "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\", \"country_code\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n", - "green_taxi_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Remove some of the columns that you won't need for modeling or additional feature building. Rename the time field for pickup time, and additionally convert the time to midnight using `pandas.Series.dt.normalize`. This is done to all time features so that the datetime column can be later used as a key when joining datasets together at a daily level of granularity." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vendorIDlpepPickupDatetimepassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cosdatetime
137986022016-01-14 06:39:0011.23-73.91182740.775372-73.89963540.7683337.311436US1.0000006.123234e-170.433884-0.9009692016-01-14
37754822016-01-01 06:22:0150.91-73.96204440.709797-73.94671640.7069026.31146US1.0000006.123234e-17-0.433884-0.9009692016-01-01
47397622016-01-08 20:55:4963.42-73.90482340.741776-73.87881540.71762512.818420US-0.8660255.000000e-01-0.433884-0.9009692016-01-08
124668322016-01-15 08:27:4113.99-73.91148440.854698-73.88182140.88213015.811548US0.866025-5.000000e-01-0.433884-0.9009692016-01-15
115226122016-01-09 04:35:2110.98-73.92177640.767071-73.93313640.7745678.01954US0.8660255.000000e-01-0.974928-0.2225212016-01-09
\n", - "
" - ], - "text/plain": [ - " vendorID lpepPickupDatetime passengerCount tripDistance \\\n", - "1379860 2 2016-01-14 06:39:00 1 1.23 \n", - "377548 2 2016-01-01 06:22:01 5 0.91 \n", - "473976 2 2016-01-08 20:55:49 6 3.42 \n", - "1246683 2 2016-01-15 08:27:41 1 3.99 \n", - "1152261 2 2016-01-09 04:35:21 1 0.98 \n", - "\n", - " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n", - "1379860 -73.911827 40.775372 -73.899635 40.768333 \n", - "377548 -73.962044 40.709797 -73.946716 40.706902 \n", - "473976 -73.904823 40.741776 -73.878815 40.717625 \n", - "1246683 -73.911484 40.854698 -73.881821 40.882130 \n", - "1152261 -73.921776 40.767071 -73.933136 40.774567 \n", - "\n", - " totalAmount month_num day_of_month day_of_week hour_of_day \\\n", - "1379860 7.3 1 14 3 6 \n", - "377548 6.3 1 1 4 6 \n", - "473976 12.8 1 8 4 20 \n", - "1246683 15.8 1 15 4 8 \n", - "1152261 8.0 1 9 5 4 \n", - "\n", - " country_code hr_sin hr_cos dy_sin dy_cos datetime \n", - "1379860 US 1.000000 6.123234e-17 0.433884 -0.900969 2016-01-14 \n", - "377548 US 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", - "473976 US -0.866025 5.000000e-01 -0.433884 -0.900969 2016-01-08 \n", - "1246683 US 0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-15 \n", - "1152261 US 0.866025 5.000000e-01 -0.974928 -0.222521 2016-01-09 " - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n", - " \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n", - " \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"]\n", - "\n", - "green_taxi_df.drop(columns_to_remove, axis=1, inplace=True)\n", - "\n", - "green_taxi_df[\"datetime\"] = green_taxi_df[\"lpepPickupDatetime\"].dt.normalize()\n", - "green_taxi_df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Enrich with Holiday Data\n", - "\n", - "Now that the taxi data is downloaded and roughly prepared, add in holiday data as additional features. Holiday-specific features will assist model accuracy, as major holidays are times where taxi demand increases dramatically and supply becomes limited. The holiday dataset is relatively small, so fetch the full set by using the `PublicHolidays` class constructor with no parameters for filtering. Preview the data to check the format." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpya4i60qp\\https%3A\\%2Fazureopendatastorage.azurefd.net\\holidaydatacontainer\\Processed\\part-00000-tid-8468414522853579044-35925ba8-a227-4b80-9c89-17065e7bf1db-649-c000.snappy.parquet\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
countryOrRegionholidayNamenormalizeHolidayNameisPaidTimeOffcountryRegionCodedate
19375ArgentinaAño Nuevo [New Year's Day]Año Nuevo [New Year's Day]NoneAR2008-01-01
19376AustraliaNew Year's DayNew Year's DayNoneAU2008-01-01
19377AustriaNeujahrNeujahrNoneAT2008-01-01
19378BelarusНовый годНовый годNoneBY2008-01-01
19379BelgiumNieuwjaarsdagNieuwjaarsdagNoneBE2008-01-01
\n", - "
" - ], - "text/plain": [ - " countryOrRegion holidayName normalizeHolidayName \\\n", - "19375 Argentina Año Nuevo [New Year's Day] Año Nuevo [New Year's Day] \n", - "19376 Australia New Year's Day New Year's Day \n", - "19377 Austria Neujahr Neujahr \n", - "19378 Belarus Новый год Новый год \n", - "19379 Belgium Nieuwjaarsdag Nieuwjaarsdag \n", - "\n", - " isPaidTimeOff countryRegionCode date \n", - "19375 None AR 2008-01-01 \n", - "19376 None AU 2008-01-01 \n", - "19377 None AT 2008-01-01 \n", - "19378 None BY 2008-01-01 \n", - "19379 None BE 2008-01-01 " - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from azureml.opendatasets import PublicHolidays\n", - "\n", - "# call default constructor to download full dataset\n", - "holidays_df = PublicHolidays().to_pandas_dataframe()\n", - "holidays_df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Rename the `countryRegionCode` and `date` columns to match the respective field names from the taxi data, and also normalize the time so it can be used as a key. Next, join the holiday data with the taxi data by performing a left-join using the Pandas `merge()` function. This will preserve all records from `green_taxi_df`, but add in holiday data where it exists for the corresponding `datetime` and `country_code`, which in this case is always `\\\"US\\\"`. Preview the data to verify that they were merged correctly." - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vendorIDlpepPickupDatetimepassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_num...day_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cosdatetimenormalizeHolidayNameisPaidTimeOff
122016-01-01 06:22:0150.91-73.96204440.709797-73.94671640.7069026.301...46US1.0000006.123234e-17-0.433884-0.9009692016-01-01New Year's DayTrue
2522016-01-01 06:14:4312.44-73.99357640.681519-73.99959640.65593010.301...46US1.0000006.123234e-17-0.433884-0.9009692016-01-01New Year's DayTrue
2722016-01-01 16:06:3314.57-73.96250940.687862-73.98136140.73275822.251...416US-0.866025-5.000000e-01-0.433884-0.9009692016-01-01New Year's DayTrue
4422016-01-18 11:46:27116.10-73.92552240.827877-73.93498240.68127850.301...011US0.258819-9.659258e-010.0000001.0000002016-01-18Martin Luther King Jr. DayNone
4522016-01-01 10:41:3913.33-73.96289140.711971-73.91806040.73683212.801...410US0.500000-8.660254e-01-0.433884-0.9009692016-01-01New Year's DayTrue
..................................................................
2386822016-12-25 00:21:2312.36NaNNaNNaNNaN12.3012...60US0.0000001.000000e+00-0.7818310.6234902016-12-25Christmas DayTrue
2389222016-12-25 14:05:4811.05NaNNaNNaNNaN12.3012...614US-0.500000-8.660254e-01-0.7818310.6234902016-12-25Christmas DayTrue
2394212016-12-26 01:43:5710.80NaNNaNNaNNaN7.5512...01US0.2588199.659258e-010.0000001.0000002016-12-26Christmas DayTrue
2397822016-12-26 03:38:3311.55NaNNaNNaNNaN8.3012...03US0.7071077.071068e-010.0000001.0000002016-12-26Christmas DayTrue
2398522016-12-26 22:12:1813.77NaNNaNNaNNaN16.2512...022US-0.5000008.660254e-010.0000001.0000002016-12-26Christmas DayTrue
\n", - "

673 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " vendorID lpepPickupDatetime passengerCount tripDistance \\\n", - "1 2 2016-01-01 06:22:01 5 0.91 \n", - "25 2 2016-01-01 06:14:43 1 2.44 \n", - "27 2 2016-01-01 16:06:33 1 4.57 \n", - "44 2 2016-01-18 11:46:27 1 16.10 \n", - "45 2 2016-01-01 10:41:39 1 3.33 \n", - "... ... ... ... ... \n", - "23868 2 2016-12-25 00:21:23 1 2.36 \n", - "23892 2 2016-12-25 14:05:48 1 1.05 \n", - "23942 1 2016-12-26 01:43:57 1 0.80 \n", - "23978 2 2016-12-26 03:38:33 1 1.55 \n", - "23985 2 2016-12-26 22:12:18 1 3.77 \n", - "\n", - " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n", - "1 -73.962044 40.709797 -73.946716 40.706902 \n", - "25 -73.993576 40.681519 -73.999596 40.655930 \n", - "27 -73.962509 40.687862 -73.981361 40.732758 \n", - "44 -73.925522 40.827877 -73.934982 40.681278 \n", - "45 -73.962891 40.711971 -73.918060 40.736832 \n", - "... ... ... ... ... \n", - "23868 NaN NaN NaN NaN \n", - "23892 NaN NaN NaN NaN \n", - "23942 NaN NaN NaN NaN \n", - "23978 NaN NaN NaN NaN \n", - "23985 NaN NaN NaN NaN \n", - "\n", - " totalAmount month_num ... day_of_week hour_of_day country_code \\\n", - "1 6.30 1 ... 4 6 US \n", - "25 10.30 1 ... 4 6 US \n", - "27 22.25 1 ... 4 16 US \n", - "44 50.30 1 ... 0 11 US \n", - "45 12.80 1 ... 4 10 US \n", - "... ... ... ... ... ... ... \n", - "23868 12.30 12 ... 6 0 US \n", - "23892 12.30 12 ... 6 14 US \n", - "23942 7.55 12 ... 0 1 US \n", - "23978 8.30 12 ... 0 3 US \n", - "23985 16.25 12 ... 0 22 US \n", - "\n", - " hr_sin hr_cos dy_sin dy_cos datetime \\\n", - "1 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", - "25 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", - "27 -0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-01 \n", - "44 0.258819 -9.659258e-01 0.000000 1.000000 2016-01-18 \n", - "45 0.500000 -8.660254e-01 -0.433884 -0.900969 2016-01-01 \n", - "... ... ... ... ... ... \n", - "23868 0.000000 1.000000e+00 -0.781831 0.623490 2016-12-25 \n", - "23892 -0.500000 -8.660254e-01 -0.781831 0.623490 2016-12-25 \n", - "23942 0.258819 9.659258e-01 0.000000 1.000000 2016-12-26 \n", - "23978 0.707107 7.071068e-01 0.000000 1.000000 2016-12-26 \n", - "23985 -0.500000 8.660254e-01 0.000000 1.000000 2016-12-26 \n", - "\n", - " normalizeHolidayName isPaidTimeOff \n", - "1 New Year's Day True \n", - "25 New Year's Day True \n", - "27 New Year's Day True \n", - "44 Martin Luther King Jr. Day None \n", - "45 New Year's Day True \n", - "... ... ... \n", - "23868 Christmas Day True \n", - "23892 Christmas Day True \n", - "23942 Christmas Day True \n", - "23978 Christmas Day True \n", - "23985 Christmas Day True \n", - "\n", - "[673 rows x 21 columns]" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "holidays_df = holidays_df.rename(columns={\"countryRegionCode\": \"country_code\"})\n", - "holidays_df[\"datetime\"] = holidays_df[\"date\"].dt.normalize()\n", - "\n", - "holidays_df.drop([\"countryOrRegion\", \"holidayName\", \"date\"], axis=1, inplace=True)\n", - "\n", - "taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how=\"left\", on=[\"datetime\", \"country_code\"])\n", - "taxi_holidays_df[taxi_holidays_df[\"normalizeHolidayName\"].notnull()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Enrich with weather data\n", - "\n", - "Now NOAA surface weather data can be appended to the taxi and holiday data. Use a similar approach to fetch the weather data by downloading one month at a time iteratively. 
Additionally, specify the `cols` parameter with an array of strings to filter the columns to download. This is a very large dataset containing weather surface data from all over the world, so before appending each month, filter the lat/long fields to near NYC using the `query()` function on the dataframe. This will ensure the `weather_df` doesn't get too large." - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-1.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-2.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-2.c000.snappy.parquet\n", - "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-2.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-2.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-2.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-2.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-2.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-2.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-3.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-3.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-3.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-3.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-3.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-3.c000.snappy.parquet\n", - "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-3.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-3.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-4.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-4.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-4.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-4.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-4.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-4.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-4.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-4.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-5.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-5.c000.snappy.parquet\n", - "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-5.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-5.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-5.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-5.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-5.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-5.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-6.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-6.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-6.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-6.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-6.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-6.c000.snappy.parquet\n", - "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-6.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-6.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-7.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-7.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-7.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-7.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-7.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-7.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-7.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-7.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-8.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-8.c000.snappy.parquet\n", - "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-8.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-8.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-8.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-8.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-8.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-8.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-9.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-9.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-9.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-9.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-9.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-9.c000.snappy.parquet\n", - "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-9.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-9.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-10.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-10.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-10.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-10.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-10.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-10.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-10.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-10.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-11.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-11.c000.snappy.parquet\n", - "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-11.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-11.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-11.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-11.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-11.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-11.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-12.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-12.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-12.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-12.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-12.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-12.c000.snappy.parquet\n", - "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-12.c000.snappy.parquet\n", - "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-12.c000.snappy.parquet\n" - ] - } - ], - "source": [ - "from azureml.opendatasets import NoaaIsdWeather\n", - "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n", - "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n", - "\n", - "weather_df = pd.concat([NoaaIsdWeather(cols=[\"temperature\", \"precipTime\", \"precipDepth\"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\\\n", - " .to_pandas_dataframe().query(\"latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature\") for x in range(12)])" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wbanlatitudetemperatureusafdatetimelongitudeprecipDepthprecipTime
2046471473240.7832.87250302016-01-02 03:00:00-73.867NaNNaN
2046701473240.779-4.47250302016-01-22 13:51:00-73.8800.01.0
2046941473240.7795.07250302016-01-08 02:51:00-73.8800.01.0
2047011473240.779-1.17250302016-01-04 15:51:00-73.8800.01.0
2047151473240.7794.47250302016-01-01 21:51:00-73.8800.01.0
...........................
12484719472840.7894.47250532016-12-23 13:51:00-73.9670.01.0
12485559472840.7895.07250532016-12-12 13:51:00-73.9670.01.0
12485809472840.7893.97250532016-12-18 07:01:00-73.967NaNNaN
12485979472840.7897.87250532016-12-25 00:51:00-73.9670.01.0
12486009472840.789-2.87250532016-12-17 11:10:00-73.9675.01.0
\n", - "

55683 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " wban latitude temperature usaf datetime longitude \\\n", - "204647 14732 40.783 2.8 725030 2016-01-02 03:00:00 -73.867 \n", - "204670 14732 40.779 -4.4 725030 2016-01-22 13:51:00 -73.880 \n", - "204694 14732 40.779 5.0 725030 2016-01-08 02:51:00 -73.880 \n", - "204701 14732 40.779 -1.1 725030 2016-01-04 15:51:00 -73.880 \n", - "204715 14732 40.779 4.4 725030 2016-01-01 21:51:00 -73.880 \n", - "... ... ... ... ... ... ... \n", - "1248471 94728 40.789 4.4 725053 2016-12-23 13:51:00 -73.967 \n", - "1248555 94728 40.789 5.0 725053 2016-12-12 13:51:00 -73.967 \n", - "1248580 94728 40.789 3.9 725053 2016-12-18 07:01:00 -73.967 \n", - "1248597 94728 40.789 7.8 725053 2016-12-25 00:51:00 -73.967 \n", - "1248600 94728 40.789 -2.8 725053 2016-12-17 11:10:00 -73.967 \n", - "\n", - " precipDepth precipTime \n", - "204647 NaN NaN \n", - "204670 0.0 1.0 \n", - "204694 0.0 1.0 \n", - "204701 0.0 1.0 \n", - "204715 0.0 1.0 \n", - "... ... ... \n", - "1248471 0.0 1.0 \n", - "1248555 0.0 1.0 \n", - "1248580 NaN NaN \n", - "1248597 0.0 1.0 \n", - "1248600 5.0 1.0 \n", - "\n", - "[55683 rows x 8 columns]" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "weather_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Again call `pandas.Series.dt.normalize` on the `datetime` field in the weather data so it matches the time key in `taxi_holidays_df`.\n", - "\n", - "\n", - "Next group the weather data to have daily aggregated weather values. Define a dict `aggregations` to define how to aggregate each field at a daily level. For`temperature` take the mean and for `precipTime` and `precipDepth` take the daily maximum. Use the `groupby()` function along with the aggregations to group the data. Preview the data to ensure there is one record per day." - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
precipTimetemperatureprecipDepth
datetime
2016-01-011.05.1973450.0
2016-01-021.02.5678570.0
2016-01-031.03.8464290.0
2016-01-041.00.1238940.0
2016-01-056.0-7.2062500.0
2016-01-066.0-0.8963960.0
2016-01-076.03.1806450.0
2016-01-081.04.3840910.0
2016-01-096.06.7102743.0
2016-01-1024.010.943655254.0
\n", - "
" - ], - "text/plain": [ - " precipTime temperature precipDepth\n", - "datetime \n", - "2016-01-01 1.0 5.197345 0.0\n", - "2016-01-02 1.0 2.567857 0.0\n", - "2016-01-03 1.0 3.846429 0.0\n", - "2016-01-04 1.0 0.123894 0.0\n", - "2016-01-05 6.0 -7.206250 0.0\n", - "2016-01-06 6.0 -0.896396 0.0\n", - "2016-01-07 6.0 3.180645 0.0\n", - "2016-01-08 1.0 4.384091 0.0\n", - "2016-01-09 6.0 6.710274 3.0\n", - "2016-01-10 24.0 10.943655 254.0" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "weather_df[\"datetime\"] = weather_df[\"datetime\"].dt.normalize()\n", - "\n", - "# group by datetime\n", - "aggregations = {\"precipTime\": \"max\", \"temperature\": \"mean\", \"precipDepth\": \"max\"}\n", - "weather_df_grouped = weather_df.groupby(\"datetime\").agg(aggregations)\n", - "weather_df_grouped.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: The examples in this tutorial merge data using Pandas functions and custom aggregations, but the Open Datasets SDK has classes designed to easily merge and enrich data sets. See the [notebook](https://github.com/Azure/OpenDatasetsNotebooks/blob/master/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb) for code examples of these design patterns." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cleanse data\n", - "\n", - "Merge the existing taxi and holiday data with the new weather data. This time `datetime` is the only key, and again perform a left-join of the data. Run the `describe()` function on the new dataframe to see summary statistics for each field." - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vendorIDpassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_dayhr_sinhr_cosdy_sindy_cosprecipTimetemperatureprecipDepth
count24000.00000024000.00000024000.00000012000.00000012000.00000012000.00000012000.00000024000.00000024000.00000024000.00000024000.00000024000.00000024000.0000002.400000e+0424000.00000024000.00000024000.00000024000.00000024000.000000
mean1.7896671.3552922.830398-73.81439340.678791-73.83701940.69072914.6682516.50000015.0687503.24779213.582875-0.239687-1.510585e-02-0.079292-0.05963013.31866713.8782721037.956292
std0.4075541.0200183.1183023.0163851.6631522.6986091.48803211.7385323.4521248.4775551.9512096.7083720.6675287.048175e-010.7144570.69264010.3331629.4844432788.844868
min1.0000000.0000000.000000-74.1648250.000000-75.1864400.000000-200.0000001.0000001.0000000.0000000.000000-1.000000-1.000000e+00-0.974928-0.9009691.000000-13.3794640.000000
25%2.0000001.0000001.040000-73.96137040.693539-73.96751440.6951287.8800003.7500008.0000002.0000009.000000-0.866025-7.071068e-01-0.781831-0.9009696.0000006.6207730.000000
50%2.0000001.0000001.840000-73.94713240.745928-73.94586940.74591411.3000006.50000015.0000003.00000015.000000-0.500000-1.836970e-160.000000-0.2225216.00000013.10832310.000000
75%2.0000001.0000003.500000-73.91963840.802049-73.91305940.79107617.7500009.25000022.0000005.00000019.0000000.2588197.071068e-010.7818310.62349024.00000022.944737127.000000
max2.0000007.000000106.6800000.00000041.0810470.00000041.081055450.00000012.00000030.0000006.00000023.0000001.0000001.000000e+000.9749281.00000024.00000031.3036659999.000000
\n", - "
" - ], - "text/plain": [ - " vendorID passengerCount tripDistance pickupLongitude \\\n", - "count 24000.000000 24000.000000 24000.000000 12000.000000 \n", - "mean 1.789667 1.355292 2.830398 -73.814393 \n", - "std 0.407554 1.020018 3.118302 3.016385 \n", - "min 1.000000 0.000000 0.000000 -74.164825 \n", - "25% 2.000000 1.000000 1.040000 -73.961370 \n", - "50% 2.000000 1.000000 1.840000 -73.947132 \n", - "75% 2.000000 1.000000 3.500000 -73.919638 \n", - "max 2.000000 7.000000 106.680000 0.000000 \n", - "\n", - " pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\n", - "count 12000.000000 12000.000000 12000.000000 24000.000000 \n", - "mean 40.678791 -73.837019 40.690729 14.668251 \n", - "std 1.663152 2.698609 1.488032 11.738532 \n", - "min 0.000000 -75.186440 0.000000 -200.000000 \n", - "25% 40.693539 -73.967514 40.695128 7.880000 \n", - "50% 40.745928 -73.945869 40.745914 11.300000 \n", - "75% 40.802049 -73.913059 40.791076 17.750000 \n", - "max 41.081047 0.000000 41.081055 450.000000 \n", - "\n", - " month_num day_of_month day_of_week hour_of_day hr_sin \\\n", - "count 24000.000000 24000.000000 24000.000000 24000.000000 24000.000000 \n", - "mean 6.500000 15.068750 3.247792 13.582875 -0.239687 \n", - "std 3.452124 8.477555 1.951209 6.708372 0.667528 \n", - "min 1.000000 1.000000 0.000000 0.000000 -1.000000 \n", - "25% 3.750000 8.000000 2.000000 9.000000 -0.866025 \n", - "50% 6.500000 15.000000 3.000000 15.000000 -0.500000 \n", - "75% 9.250000 22.000000 5.000000 19.000000 0.258819 \n", - "max 12.000000 30.000000 6.000000 23.000000 1.000000 \n", - "\n", - " hr_cos dy_sin dy_cos precipTime temperature \\\n", - "count 2.400000e+04 24000.000000 24000.000000 24000.000000 24000.000000 \n", - "mean -1.510585e-02 -0.079292 -0.059630 13.318667 13.878272 \n", - "std 7.048175e-01 0.714457 0.692640 10.333162 9.484443 \n", - "min -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n", - "25% -7.071068e-01 -0.781831 -0.900969 6.000000 6.620773 \n", - "50% -1.836970e-16 0.000000 -0.222521 6.000000 13.108323 \n", - "75% 7.071068e-01 0.781831 0.623490 24.000000 22.944737 \n", - "max 1.000000e+00 0.974928 1.000000 24.000000 31.303665 \n", - "\n", - " precipDepth \n", - "count 24000.000000 \n", - "mean 1037.956292 \n", - "std 2788.844868 \n", - "min 0.000000 \n", - "25% 0.000000 \n", - "50% 10.000000 \n", - "75% 127.000000 \n", - "max 9999.000000 " - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how=\"left\", on=[\"datetime\"])\n", - "taxi_holidays_weather_df.describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the summary statistics, you see that there are several fields that have outliers or values that will reduce model accuracy. First filter the lat/long fields to be within the same bounds you used for filtering weather data. The `tripDistance` field has some bad data, because the minimum value is negative. The `passengerCount` field has bad data as well, with the max value being 210 passengers. Lastly, the `totalAmount` field has negative values, which don't make sense in the context of our model.\n", - "\n", - "Filter out these anomolies using query functions, and then remove the last few columns unnecesary for training.\n", - "\n", - "Note: since a random sample of 2000 was taken for each month of the taxi data, the statistics may vary each time this is ran." 
- ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "final_df = taxi_holidays_weather_df.query(\"pickupLatitude>=40.53 and pickupLatitude<=40.88 and \\\n", - " pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \\\n", - " tripDistance>0 and tripDistance<75 and \\\n", - " passengerCount>0 and passengerCount<100 and \\\n", - " totalAmount>0\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Call `describe()` again on the data to ensure cleansing worked as expected. The final data is prepared and cleansed, consisting of taxi, holiday, and weather data, and is ready to use for machine learning model training." - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vendorIDpassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_dayhr_sinhr_cosdy_sindy_cosprecipTimetemperatureprecipDepth
count11763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.0000001.176300e+0411763.00000011763.00000011763.00000011763.00000011763.000000
mean1.7901901.3692942.841407-73.93791140.746224-73.91090140.73081814.5579173.50131814.9292703.25231713.538553-0.236544-2.265927e-03-0.070226-0.05905911.99396410.288261192.179546
std0.4071911.0416342.8298640.0411210.0568181.3641140.7534689.9891651.7073508.4757931.9481276.7780120.6688127.048492e-010.7188710.68912210.1147758.5300111223.101074
min1.0000001.0000000.010000-74.03519440.572906-74.1830290.0000000.0100001.0000001.0000000.0000000.000000-1.000000-1.000000e+00-0.974928-0.9009691.000000-13.3794640.000000
25%2.0000001.0000001.090000-73.96160140.693594-73.96779340.6954408.1600002.0000008.0000002.0000009.000000-0.866025-7.071068e-01-0.781831-0.9009691.0000003.5045800.000000
50%2.0000001.0000001.900000-73.94751740.745842-73.94624340.74578911.3000004.00000015.0000003.00000015.000000-0.500000-1.836970e-160.000000-0.2225216.00000010.4682763.000000
75%2.0000001.0000003.530000-73.92050940.801752-73.91380740.78994217.3800005.00000022.0000005.00000019.0000000.2588197.071068e-010.7818310.62349024.00000016.96692341.000000
max2.0000006.00000038.850000-73.73889940.8799820.00000041.073185123.8000006.00000030.0000006.00000023.0000001.0000001.000000e+000.9749281.00000024.00000026.5241079999.000000
\n", - "
" - ], - "text/plain": [ - " vendorID passengerCount tripDistance pickupLongitude \\\n", - "count 11763.000000 11763.000000 11763.000000 11763.000000 \n", - "mean 1.790190 1.369294 2.841407 -73.937911 \n", - "std 0.407191 1.041634 2.829864 0.041121 \n", - "min 1.000000 1.000000 0.010000 -74.035194 \n", - "25% 2.000000 1.000000 1.090000 -73.961601 \n", - "50% 2.000000 1.000000 1.900000 -73.947517 \n", - "75% 2.000000 1.000000 3.530000 -73.920509 \n", - "max 2.000000 6.000000 38.850000 -73.738899 \n", - "\n", - " pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\n", - "count 11763.000000 11763.000000 11763.000000 11763.000000 \n", - "mean 40.746224 -73.910901 40.730818 14.557917 \n", - "std 0.056818 1.364114 0.753468 9.989165 \n", - "min 40.572906 -74.183029 0.000000 0.010000 \n", - "25% 40.693594 -73.967793 40.695440 8.160000 \n", - "50% 40.745842 -73.946243 40.745789 11.300000 \n", - "75% 40.801752 -73.913807 40.789942 17.380000 \n", - "max 40.879982 0.000000 41.073185 123.800000 \n", - "\n", - " month_num day_of_month day_of_week hour_of_day hr_sin \\\n", - "count 11763.000000 11763.000000 11763.000000 11763.000000 11763.000000 \n", - "mean 3.501318 14.929270 3.252317 13.538553 -0.236544 \n", - "std 1.707350 8.475793 1.948127 6.778012 0.668812 \n", - "min 1.000000 1.000000 0.000000 0.000000 -1.000000 \n", - "25% 2.000000 8.000000 2.000000 9.000000 -0.866025 \n", - "50% 4.000000 15.000000 3.000000 15.000000 -0.500000 \n", - "75% 5.000000 22.000000 5.000000 19.000000 0.258819 \n", - "max 6.000000 30.000000 6.000000 23.000000 1.000000 \n", - "\n", - " hr_cos dy_sin dy_cos precipTime temperature \\\n", - "count 1.176300e+04 11763.000000 11763.000000 11763.000000 11763.000000 \n", - "mean -2.265927e-03 -0.070226 -0.059059 11.993964 10.288261 \n", - "std 7.048492e-01 0.718871 0.689122 10.114775 8.530011 \n", - "min -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n", - "25% -7.071068e-01 -0.781831 -0.900969 1.000000 3.504580 \n", - "50% -1.836970e-16 0.000000 -0.222521 6.000000 10.468276 \n", - "75% 7.071068e-01 0.781831 0.623490 24.000000 16.966923 \n", - "max 1.000000e+00 0.974928 1.000000 24.000000 26.524107 \n", - "\n", - " precipDepth \n", - "count 11763.000000 \n", - "mean 192.179546 \n", - "std 1223.101074 \n", - "min 0.000000 \n", - "25% 0.000000 \n", - "50% 3.000000 \n", - "75% 41.000000 \n", - "max 9999.000000 " - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "final_df.describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train a model\n", - "\n", - "The data is ready to train a machine learning model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.linear_model import RidgeCV\n", - "from sklearn.linear_model import Ridge\n", - "from sklearn.ensemble import RandomForestRegressor\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training Function\n", - "\n", - "Define a function that can be used to create a model pipeline that can be trained and then used for scoring. This pipeline has 2 steps: preprocessing and model training.\n", - "\n", - "Preprocessing Stages:\n", - "The preprocessing step of the pipeline also has 2 stages, one for numerical features and one for categorical features.\n", - "For the numerical features, let's fill in any blanks with 0's. While the training data may not have any nulls in the these fields, future data that is scored may and this step will take care of those for us. Optionally, a scaler transformation could be added in this step as well. Similarly for the categorical variables, let's have the null values filled with \"MISSING\". Additionally to the categorical variables, these will need to be one hot encoded, so we will include that step in our pipeline.\n", - "\n", - "Model Training Stage:\n", - "An input parameter will determine which type of model of train. Let's test out a linear regression and random forest model to start. \n", - "\n", - "The two steps are put together into the pipeline which is what the function is returning." - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "def createClassModel(algo_name, catg, nums):\n", - " numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])\n", - " \n", - " categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=\"MISSING\")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n", - " \n", - " preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])\n", - " \n", - " if algo_name == 'linear_regression':\n", - " model=Ridge(alpha=100)\n", - " elif algo_name == 'random_forest':\n", - " model = RandomForestRegressor()\n", - " else:\n", - " pass\n", - " ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), (\"model\", model)])\n", - " return ModelPipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define the arguments that will be passed to the function. `catg_cols` is a list of the categorical variables that will be transformed in our processing step. `num_cols` is a list of the numerical variables that will be transformed in our processing step. Let's define the target column as `label` so it can be used in future steps as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "catg_cols = [\"vendorID\", \"month_num\", \"day_of_month\", \"normalizeHolidayName\", \"isPaidTimeOff\"]\n", - "num_cols = [\"passengerCount\", \"tripDistance\", \"precipTime\", \"temperature\", \"precipDepth\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]\n", - "label = [\"totalAmount\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The training is ready to begin, but first, let's make sure that the categorical variables are strings in our dataframe to ensure no errors in our pipeline. \n", - "\n", - "Next, the data is split into training and test sets by using the `train_test_split()` function in the `scikit-learn` library. The `test_size` parameter determines the percentage of data to allocate to testing. The `random_state` parameter sets a seed to the random number generator, so that your train-test splits are deterministic.\n", - "\n", - "The training will happen in the for loop so that both algorithms can be tested. The createClassModel funtion is called to retreive the pipeline that can then be trained using the training dataset. \n", - "\n", - "Once trained, the test dataset is then ran through the model to test the model's performance. Using various functions from sklearn.metrics, the R2 score, MAPE, and RMSE can be used to measure model performance." - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "linear_regression\n", - "R2: 0.8034971051723139\n", - "MAPE: 0.15888983234876766\n", - "RMSE: 4.606544019524053\n", - "\n", - "random_forest\n", - "R2: 0.8073017231520601\n", - "MAPE: 0.14715914748857337\n", - "RMSE: 4.5617309259357475\n", - "\n" - ] - } - ], - "source": [ - "# make sure categorical columns are strings\n", - "final_df[catg_cols] = final_df[catg_cols].astype(\"str\")\n", - "\n", - "# split data\n", - "X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)\n", - "\n", - "# test 2 algorithms\n", - "for algorithmname in [\"linear_regression\", 'random_forest']:\n", - " fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline\n", - " fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine\n", - "\n", - " y_pred = fitPipeline.predict(X_test) # score with fitted pipeline\n", - "\n", - " # Evaluate\n", - " r2 = r2_score(y_test, y_pred)\n", - " mape = mean_absolute_percentage_error(y_test, y_pred)\n", - " rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n", - "\n", - " print(algorithmname)\n", - " print(\"R2:\", r2)\n", - " print(\"MAPE:\", mape)\n", - " print(\"RMSE:\", rmse)\n", - " print()" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "74e9702761b8f12846716a18132904990016d49f378e22e0e13a0e91318de754" - }, - "kernelspec": { - "display_name": "Python 3.8.12 ('mlopsenv')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial: Build a regression model with Open Datasets\n", + "\n", + "In this 
tutorial, you leverage the convenience of Azure Open Datasets to create a regression model to predict NYC taxi fare prices. Easily download publicly available taxi, holiday and weather data to create a dataset that can train a regression model using sklearn." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.opendatasets import NycTlcGreen\n", + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import datetime\n", + "from dateutil.relativedelta import relativedelta\n", + "\n", + "pd.options.mode.chained_assignment = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Data\n", + "Begin by downloading the NYC Taxi dataset from Azure Open Datasets. In non-Spark environments, Open Datasets only allows one month of data at a time with certain classes to avoid MemoryError with large datasets. To download 1 year of taxi data, we will fetch 2000 random samples from each month.\n", + "\n", + "Note: Open Datasets has mirroring classes for working in Spark where data size and memory are not a concern." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpviwf6gni\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=1\\part-00119-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2689-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp6e1co7l5\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=2\\part-00060-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2630-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd5lgxojh\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=3\\part-00196-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2766-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpela340gr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=4\\part-00121-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2691-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpe79pzv2_\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=5\\part-00044-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2614-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpyxyv_8h4\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=6\\part-00108-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2678-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp498a1aem\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=7\\part-00020-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2590-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpuhi_se7a\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=8\\part-00172-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2742-2.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd7id7xon\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=9\\part-00076-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2646-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp3he0z_qe\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=10\\part-00090-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2660-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1sa8wuxl\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=11\\part-00021-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2591-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1e7uekhr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=12\\part-00116-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2686-1.c000.snappy.parquet\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimelpepDropoffDatetimepassengerCounttripDistancepuLocationIddoLocationIdpickupLongitudepickupLatitudedropoffLongitude...paymentTypefareAmountextramtaTaximprovementSurchargetipAmounttollsAmountehailFeetotalAmounttripType
137986022016-01-14 06:39:002016-01-14 06:44:5511.23NoneNone-73.91182740.775372-73.899635...26.50.00.50.30.000.0NaN7.301.0
37754822016-01-01 06:22:012016-01-01 06:27:1450.91NoneNone-73.96204440.709797-73.946716...25.50.00.50.30.000.0NaN6.301.0
47397622016-01-08 20:55:492016-01-08 21:05:5063.42NoneNone-73.90482340.741776-73.878815...211.50.50.50.30.000.0NaN12.801.0
124668322016-01-15 08:27:412016-01-15 08:41:0513.99NoneNone-73.91148440.854698-73.881821...215.00.00.50.30.000.0NaN15.801.0
115226122016-01-09 04:35:212016-01-09 04:41:0210.98NoneNone-73.92177640.767071-73.933136...16.00.50.50.30.700.0NaN8.001.0
..................................................................
99827312016-12-24 22:03:252016-12-24 22:17:1615.3074235NaNNaNNaN...216.50.50.50.30.000.0NaN17.801.0
85720022016-12-03 20:33:532016-12-03 20:53:5114.8183258NaNNaNNaN...118.50.50.50.33.000.0NaN22.801.0
60776822016-12-18 16:17:542016-12-18 16:33:1332.029556NaNNaNNaN...211.50.00.50.30.000.0NaN12.301.0
7868722016-12-06 09:24:432016-12-06 09:41:0919.516611NaNNaNNaN...227.50.00.50.30.000.0NaN28.301.0
14167222016-12-14 16:12:342016-12-14 16:15:1110.51255256NaNNaNNaN...14.01.00.50.31.450.0NaN7.251.0
\n", + "

24000 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n", + "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n", + "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n", + "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n", + "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n", + "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n", + "... ... ... ... ... \n", + "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n", + "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n", + "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n", + "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n", + "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n", + "\n", + " tripDistance puLocationId doLocationId pickupLongitude \\\n", + "1379860 1.23 None None -73.911827 \n", + "377548 0.91 None None -73.962044 \n", + "473976 3.42 None None -73.904823 \n", + "1246683 3.99 None None -73.911484 \n", + "1152261 0.98 None None -73.921776 \n", + "... ... ... ... ... \n", + "998273 5.30 74 235 NaN \n", + "857200 4.81 83 258 NaN \n", + "607768 2.02 95 56 NaN \n", + "78687 9.51 66 11 NaN \n", + "141672 0.51 255 256 NaN \n", + "\n", + " pickupLatitude dropoffLongitude ... paymentType fareAmount extra \\\n", + "1379860 40.775372 -73.899635 ... 2 6.5 0.0 \n", + "377548 40.709797 -73.946716 ... 2 5.5 0.0 \n", + "473976 40.741776 -73.878815 ... 2 11.5 0.5 \n", + "1246683 40.854698 -73.881821 ... 2 15.0 0.0 \n", + "1152261 40.767071 -73.933136 ... 1 6.0 0.5 \n", + "... ... ... ... ... ... ... \n", + "998273 NaN NaN ... 2 16.5 0.5 \n", + "857200 NaN NaN ... 1 18.5 0.5 \n", + "607768 NaN NaN ... 2 11.5 0.0 \n", + "78687 NaN NaN ... 2 27.5 0.0 \n", + "141672 NaN NaN ... 1 4.0 1.0 \n", + "\n", + " mtaTax improvementSurcharge tipAmount tollsAmount ehailFee \\\n", + "1379860 0.5 0.3 0.00 0.0 NaN \n", + "377548 0.5 0.3 0.00 0.0 NaN \n", + "473976 0.5 0.3 0.00 0.0 NaN \n", + "1246683 0.5 0.3 0.00 0.0 NaN \n", + "1152261 0.5 0.3 0.70 0.0 NaN \n", + "... ... ... ... ... ... \n", + "998273 0.5 0.3 0.00 0.0 NaN \n", + "857200 0.5 0.3 3.00 0.0 NaN \n", + "607768 0.5 0.3 0.00 0.0 NaN \n", + "78687 0.5 0.3 0.00 0.0 NaN \n", + "141672 0.5 0.3 1.45 0.0 NaN \n", + "\n", + " totalAmount tripType \n", + "1379860 7.30 1.0 \n", + "377548 6.30 1.0 \n", + "473976 12.80 1.0 \n", + "1246683 15.80 1.0 \n", + "1152261 8.00 1.0 \n", + "... ... ... \n", + "998273 17.80 1.0 \n", + "857200 22.80 1.0 \n", + "607768 12.30 1.0 \n", + "78687 28.30 1.0 \n", + "141672 7.25 1.0 \n", + "\n", + "[24000 rows x 23 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n", + "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n", + "\n", + "green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \\\n", + " .to_pandas_dataframe().sample(2000) for x in range(12)])\n", + "green_taxi_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that the initial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day. From those, we calculate the sin and cosine transformations to capture the cyclical nature of the variable which will allow the model to factor in time-based seasonality. This function also adds a static feature for the country code to join the holiday data. 
Use the apply() function on the dataframe to iteratively apply this function to each row in the dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimelpepDropoffDatetimepassengerCounttripDistancepuLocationIddoLocationIdpickupLongitudepickupLatitudedropoffLongitude...tripTypemonth_numday_of_monthday_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cos
137986022016-01-14 06:39:002016-01-14 06:44:5511.23NoneNone-73.91182740.775372-73.899635...1.011436US1.0000006.123234e-170.433884-0.900969
37754822016-01-01 06:22:012016-01-01 06:27:1450.91NoneNone-73.96204440.709797-73.946716...1.01146US1.0000006.123234e-17-0.433884-0.900969
47397622016-01-08 20:55:492016-01-08 21:05:5063.42NoneNone-73.90482340.741776-73.878815...1.018420US-0.8660255.000000e-01-0.433884-0.900969
124668322016-01-15 08:27:412016-01-15 08:41:0513.99NoneNone-73.91148440.854698-73.881821...1.011548US0.866025-5.000000e-01-0.433884-0.900969
115226122016-01-09 04:35:212016-01-09 04:41:0210.98NoneNone-73.92177640.767071-73.933136...1.01954US0.8660255.000000e-01-0.974928-0.222521
..................................................................
99827312016-12-24 22:03:252016-12-24 22:17:1615.3074235NaNNaNNaN...1.01224522US-0.5000008.660254e-01-0.974928-0.222521
85720022016-12-03 20:33:532016-12-03 20:53:5114.8183258NaNNaNNaN...1.0123520US-0.8660255.000000e-01-0.974928-0.222521
60776822016-12-18 16:17:542016-12-18 16:33:1332.029556NaNNaNNaN...1.01218616US-0.866025-5.000000e-01-0.7818310.623490
7868722016-12-06 09:24:432016-12-06 09:41:0919.516611NaNNaNNaN...1.012619US0.707107-7.071068e-010.7818310.623490
14167222016-12-14 16:12:342016-12-14 16:15:1110.51255256NaNNaNNaN...1.01214216US-0.866025-5.000000e-010.974928-0.222521
\n", + "

24000 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n", + "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n", + "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n", + "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n", + "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n", + "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n", + "... ... ... ... ... \n", + "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n", + "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n", + "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n", + "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n", + "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n", + "\n", + " tripDistance puLocationId doLocationId pickupLongitude \\\n", + "1379860 1.23 None None -73.911827 \n", + "377548 0.91 None None -73.962044 \n", + "473976 3.42 None None -73.904823 \n", + "1246683 3.99 None None -73.911484 \n", + "1152261 0.98 None None -73.921776 \n", + "... ... ... ... ... \n", + "998273 5.30 74 235 NaN \n", + "857200 4.81 83 258 NaN \n", + "607768 2.02 95 56 NaN \n", + "78687 9.51 66 11 NaN \n", + "141672 0.51 255 256 NaN \n", + "\n", + " pickupLatitude dropoffLongitude ... tripType month_num \\\n", + "1379860 40.775372 -73.899635 ... 1.0 1 \n", + "377548 40.709797 -73.946716 ... 1.0 1 \n", + "473976 40.741776 -73.878815 ... 1.0 1 \n", + "1246683 40.854698 -73.881821 ... 1.0 1 \n", + "1152261 40.767071 -73.933136 ... 1.0 1 \n", + "... ... ... ... ... ... \n", + "998273 NaN NaN ... 1.0 12 \n", + "857200 NaN NaN ... 1.0 12 \n", + "607768 NaN NaN ... 1.0 12 \n", + "78687 NaN NaN ... 1.0 12 \n", + "141672 NaN NaN ... 1.0 12 \n", + "\n", + " day_of_month day_of_week hour_of_day country_code hr_sin \\\n", + "1379860 14 3 6 US 1.000000 \n", + "377548 1 4 6 US 1.000000 \n", + "473976 8 4 20 US -0.866025 \n", + "1246683 15 4 8 US 0.866025 \n", + "1152261 9 5 4 US 0.866025 \n", + "... ... ... ... ... ... \n", + "998273 24 5 22 US -0.500000 \n", + "857200 3 5 20 US -0.866025 \n", + "607768 18 6 16 US -0.866025 \n", + "78687 6 1 9 US 0.707107 \n", + "141672 14 2 16 US -0.866025 \n", + "\n", + " hr_cos dy_sin dy_cos \n", + "1379860 6.123234e-17 0.433884 -0.900969 \n", + "377548 6.123234e-17 -0.433884 -0.900969 \n", + "473976 5.000000e-01 -0.433884 -0.900969 \n", + "1246683 -5.000000e-01 -0.433884 -0.900969 \n", + "1152261 5.000000e-01 -0.974928 -0.222521 \n", + "... ... ... ... 
\n", + "998273 8.660254e-01 -0.974928 -0.222521 \n", + "857200 5.000000e-01 -0.974928 -0.222521 \n", + "607768 -5.000000e-01 -0.781831 0.623490 \n", + "78687 -7.071068e-01 0.781831 0.623490 \n", + "141672 -5.000000e-01 0.974928 -0.222521 \n", + "\n", + "[24000 rows x 32 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def build_time_features(vector):\n", + " pickup_datetime = vector[0]\n", + " month_num = pickup_datetime.month\n", + " day_of_month = pickup_datetime.day\n", + " day_of_week = pickup_datetime.weekday()\n", + " hour_of_day = pickup_datetime.hour\n", + " country_code = \"US\"\n", + " hr_sin = np.sin(hour_of_day*(2.*np.pi/24))\n", + " hr_cos = np.cos(hour_of_day*(2.*np.pi/24))\n", + " dy_sin = np.sin(day_of_week*(2.*np.pi/7))\n", + " dy_cos = np.cos(day_of_week*(2.*np.pi/7))\n", + " \n", + " return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos))\n", + "\n", + "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\", \"country_code\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n", + "green_taxi_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove some of the columns that you won't need for modeling or additional feature building. Rename the time field for pickup time, and additionally convert the time to midnight using `pandas.Series.dt.normalize`. This is done to all time features so that the datetime column can be later used as a key when joining datasets together at a daily level of granularity." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimepassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cosdatetime
137986022016-01-14 06:39:0011.23-73.91182740.775372-73.89963540.7683337.311436US1.0000006.123234e-170.433884-0.9009692016-01-14
37754822016-01-01 06:22:0150.91-73.96204440.709797-73.94671640.7069026.31146US1.0000006.123234e-17-0.433884-0.9009692016-01-01
47397622016-01-08 20:55:4963.42-73.90482340.741776-73.87881540.71762512.818420US-0.8660255.000000e-01-0.433884-0.9009692016-01-08
124668322016-01-15 08:27:4113.99-73.91148440.854698-73.88182140.88213015.811548US0.866025-5.000000e-01-0.433884-0.9009692016-01-15
115226122016-01-09 04:35:2110.98-73.92177640.767071-73.93313640.7745678.01954US0.8660255.000000e-01-0.974928-0.2225212016-01-09
\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime passengerCount tripDistance \\\n", + "1379860 2 2016-01-14 06:39:00 1 1.23 \n", + "377548 2 2016-01-01 06:22:01 5 0.91 \n", + "473976 2 2016-01-08 20:55:49 6 3.42 \n", + "1246683 2 2016-01-15 08:27:41 1 3.99 \n", + "1152261 2 2016-01-09 04:35:21 1 0.98 \n", + "\n", + " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n", + "1379860 -73.911827 40.775372 -73.899635 40.768333 \n", + "377548 -73.962044 40.709797 -73.946716 40.706902 \n", + "473976 -73.904823 40.741776 -73.878815 40.717625 \n", + "1246683 -73.911484 40.854698 -73.881821 40.882130 \n", + "1152261 -73.921776 40.767071 -73.933136 40.774567 \n", + "\n", + " totalAmount month_num day_of_month day_of_week hour_of_day \\\n", + "1379860 7.3 1 14 3 6 \n", + "377548 6.3 1 1 4 6 \n", + "473976 12.8 1 8 4 20 \n", + "1246683 15.8 1 15 4 8 \n", + "1152261 8.0 1 9 5 4 \n", + "\n", + " country_code hr_sin hr_cos dy_sin dy_cos datetime \n", + "1379860 US 1.000000 6.123234e-17 0.433884 -0.900969 2016-01-14 \n", + "377548 US 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", + "473976 US -0.866025 5.000000e-01 -0.433884 -0.900969 2016-01-08 \n", + "1246683 US 0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-15 \n", + "1152261 US 0.866025 5.000000e-01 -0.974928 -0.222521 2016-01-09 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n", + " \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n", + " \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"]\n", + "\n", + "green_taxi_df.drop(columns_to_remove, axis=1, inplace=True)\n", + "\n", + "green_taxi_df[\"datetime\"] = green_taxi_df[\"lpepPickupDatetime\"].dt.normalize()\n", + "green_taxi_df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Enrich with Holiday Data\n", + "\n", + "Now that the taxi data is downloaded and roughly prepared, add in holiday data as additional features. Holiday-specific features will assist model accuracy, as major holidays are times where taxi demand increases dramatically and supply becomes limited. The holiday dataset is relatively small, so fetch the full set by using the `PublicHolidays` class constructor with no parameters for filtering. Preview the data to check the format." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpya4i60qp\\https%3A\\%2Fazureopendatastorage.azurefd.net\\holidaydatacontainer\\Processed\\part-00000-tid-8468414522853579044-35925ba8-a227-4b80-9c89-17065e7bf1db-649-c000.snappy.parquet\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryOrRegionholidayNamenormalizeHolidayNameisPaidTimeOffcountryRegionCodedate
19375ArgentinaAño Nuevo [New Year's Day]Año Nuevo [New Year's Day]NoneAR2008-01-01
19376AustraliaNew Year's DayNew Year's DayNoneAU2008-01-01
19377AustriaNeujahrNeujahrNoneAT2008-01-01
19378BelarusНовый годНовый годNoneBY2008-01-01
19379BelgiumNieuwjaarsdagNieuwjaarsdagNoneBE2008-01-01
\n", + "
" + ], + "text/plain": [ + " countryOrRegion holidayName normalizeHolidayName \\\n", + "19375 Argentina Año Nuevo [New Year's Day] Año Nuevo [New Year's Day] \n", + "19376 Australia New Year's Day New Year's Day \n", + "19377 Austria Neujahr Neujahr \n", + "19378 Belarus Новый год Новый год \n", + "19379 Belgium Nieuwjaarsdag Nieuwjaarsdag \n", + "\n", + " isPaidTimeOff countryRegionCode date \n", + "19375 None AR 2008-01-01 \n", + "19376 None AU 2008-01-01 \n", + "19377 None AT 2008-01-01 \n", + "19378 None BY 2008-01-01 \n", + "19379 None BE 2008-01-01 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from azureml.opendatasets import PublicHolidays\n", + "\n", + "# call default constructor to download full dataset\n", + "holidays_df = PublicHolidays().to_pandas_dataframe()\n", + "holidays_df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rename the `countryRegionCode` and `date` columns to match the respective field names from the taxi data, and also normalize the time so it can be used as a key. Next, join the holiday data with the taxi data by performing a left-join using the Pandas `merge()` function. This will preserve all records from `green_taxi_df`, but add in holiday data where it exists for the corresponding `datetime` and `country_code`, which in this case is always `\\\"US\\\"`. Preview the data to verify that they were merged correctly." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDlpepPickupDatetimepassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_num...day_of_weekhour_of_daycountry_codehr_sinhr_cosdy_sindy_cosdatetimenormalizeHolidayNameisPaidTimeOff
122016-01-01 06:22:0150.91-73.96204440.709797-73.94671640.7069026.301...46US1.0000006.123234e-17-0.433884-0.9009692016-01-01New Year's DayTrue
2522016-01-01 06:14:4312.44-73.99357640.681519-73.99959640.65593010.301...46US1.0000006.123234e-17-0.433884-0.9009692016-01-01New Year's DayTrue
2722016-01-01 16:06:3314.57-73.96250940.687862-73.98136140.73275822.251...416US-0.866025-5.000000e-01-0.433884-0.9009692016-01-01New Year's DayTrue
4422016-01-18 11:46:27116.10-73.92552240.827877-73.93498240.68127850.301...011US0.258819-9.659258e-010.0000001.0000002016-01-18Martin Luther King Jr. DayNone
4522016-01-01 10:41:3913.33-73.96289140.711971-73.91806040.73683212.801...410US0.500000-8.660254e-01-0.433884-0.9009692016-01-01New Year's DayTrue
..................................................................
2386822016-12-25 00:21:2312.36NaNNaNNaNNaN12.3012...60US0.0000001.000000e+00-0.7818310.6234902016-12-25Christmas DayTrue
2389222016-12-25 14:05:4811.05NaNNaNNaNNaN12.3012...614US-0.500000-8.660254e-01-0.7818310.6234902016-12-25Christmas DayTrue
2394212016-12-26 01:43:5710.80NaNNaNNaNNaN7.5512...01US0.2588199.659258e-010.0000001.0000002016-12-26Christmas DayTrue
2397822016-12-26 03:38:3311.55NaNNaNNaNNaN8.3012...03US0.7071077.071068e-010.0000001.0000002016-12-26Christmas DayTrue
2398522016-12-26 22:12:1813.77NaNNaNNaNNaN16.2512...022US-0.5000008.660254e-010.0000001.0000002016-12-26Christmas DayTrue
\n", + "

673 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " vendorID lpepPickupDatetime passengerCount tripDistance \\\n", + "1 2 2016-01-01 06:22:01 5 0.91 \n", + "25 2 2016-01-01 06:14:43 1 2.44 \n", + "27 2 2016-01-01 16:06:33 1 4.57 \n", + "44 2 2016-01-18 11:46:27 1 16.10 \n", + "45 2 2016-01-01 10:41:39 1 3.33 \n", + "... ... ... ... ... \n", + "23868 2 2016-12-25 00:21:23 1 2.36 \n", + "23892 2 2016-12-25 14:05:48 1 1.05 \n", + "23942 1 2016-12-26 01:43:57 1 0.80 \n", + "23978 2 2016-12-26 03:38:33 1 1.55 \n", + "23985 2 2016-12-26 22:12:18 1 3.77 \n", + "\n", + " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n", + "1 -73.962044 40.709797 -73.946716 40.706902 \n", + "25 -73.993576 40.681519 -73.999596 40.655930 \n", + "27 -73.962509 40.687862 -73.981361 40.732758 \n", + "44 -73.925522 40.827877 -73.934982 40.681278 \n", + "45 -73.962891 40.711971 -73.918060 40.736832 \n", + "... ... ... ... ... \n", + "23868 NaN NaN NaN NaN \n", + "23892 NaN NaN NaN NaN \n", + "23942 NaN NaN NaN NaN \n", + "23978 NaN NaN NaN NaN \n", + "23985 NaN NaN NaN NaN \n", + "\n", + " totalAmount month_num ... day_of_week hour_of_day country_code \\\n", + "1 6.30 1 ... 4 6 US \n", + "25 10.30 1 ... 4 6 US \n", + "27 22.25 1 ... 4 16 US \n", + "44 50.30 1 ... 0 11 US \n", + "45 12.80 1 ... 4 10 US \n", + "... ... ... ... ... ... ... \n", + "23868 12.30 12 ... 6 0 US \n", + "23892 12.30 12 ... 6 14 US \n", + "23942 7.55 12 ... 0 1 US \n", + "23978 8.30 12 ... 0 3 US \n", + "23985 16.25 12 ... 0 22 US \n", + "\n", + " hr_sin hr_cos dy_sin dy_cos datetime \\\n", + "1 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", + "25 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n", + "27 -0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-01 \n", + "44 0.258819 -9.659258e-01 0.000000 1.000000 2016-01-18 \n", + "45 0.500000 -8.660254e-01 -0.433884 -0.900969 2016-01-01 \n", + "... ... ... ... ... ... \n", + "23868 0.000000 1.000000e+00 -0.781831 0.623490 2016-12-25 \n", + "23892 -0.500000 -8.660254e-01 -0.781831 0.623490 2016-12-25 \n", + "23942 0.258819 9.659258e-01 0.000000 1.000000 2016-12-26 \n", + "23978 0.707107 7.071068e-01 0.000000 1.000000 2016-12-26 \n", + "23985 -0.500000 8.660254e-01 0.000000 1.000000 2016-12-26 \n", + "\n", + " normalizeHolidayName isPaidTimeOff \n", + "1 New Year's Day True \n", + "25 New Year's Day True \n", + "27 New Year's Day True \n", + "44 Martin Luther King Jr. Day None \n", + "45 New Year's Day True \n", + "... ... ... \n", + "23868 Christmas Day True \n", + "23892 Christmas Day True \n", + "23942 Christmas Day True \n", + "23978 Christmas Day True \n", + "23985 Christmas Day True \n", + "\n", + "[673 rows x 21 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "holidays_df = holidays_df.rename(columns={\"countryRegionCode\": \"country_code\"})\n", + "holidays_df[\"datetime\"] = holidays_df[\"date\"].dt.normalize()\n", + "\n", + "holidays_df.drop([\"countryOrRegion\", \"holidayName\", \"date\"], axis=1, inplace=True)\n", + "\n", + "taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how=\"left\", on=[\"datetime\", \"country_code\"])\n", + "taxi_holidays_df[taxi_holidays_df[\"normalizeHolidayName\"].notnull()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Enrich with weather data\n", + "\n", + "Now NOAA surface weather data can be appended to the taxi and holiday data. Use a similar approach to fetch the weather data by downloading one month at a time iteratively. 
Additionally, specify the `cols` parameter with an array of strings to filter the columns to download. This is a very large dataset containing weather surface data from all over the world, so before appending each month, filter the lat/long fields to near NYC using the `query()` function on the dataframe. This will ensure the `weather_df` doesn't get too large." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-1.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-2.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-2.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-3.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-3.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-4.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-5.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-5.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-6.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-6.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-7.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-8.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-8.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-9.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-9.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-10.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-11.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-11.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-12.c000.snappy.parquet\n", + "[Info] read from 
C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-12.c000.snappy.parquet\n", + "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-12.c000.snappy.parquet\n" + ] + } + ], + "source": [ + "from azureml.opendatasets import NoaaIsdWeather\n", + "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n", + "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n", + "\n", + "weather_df = pd.concat([NoaaIsdWeather(cols=[\"temperature\", \"precipTime\", \"precipDepth\"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\\\n", + " .to_pandas_dataframe().query(\"latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature\") for x in range(12)])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wbanlatitudetemperatureusafdatetimelongitudeprecipDepthprecipTime
2046471473240.7832.87250302016-01-02 03:00:00-73.867NaNNaN
2046701473240.779-4.47250302016-01-22 13:51:00-73.8800.01.0
2046941473240.7795.07250302016-01-08 02:51:00-73.8800.01.0
2047011473240.779-1.17250302016-01-04 15:51:00-73.8800.01.0
2047151473240.7794.47250302016-01-01 21:51:00-73.8800.01.0
...........................
12484719472840.7894.47250532016-12-23 13:51:00-73.9670.01.0
12485559472840.7895.07250532016-12-12 13:51:00-73.9670.01.0
12485809472840.7893.97250532016-12-18 07:01:00-73.967NaNNaN
12485979472840.7897.87250532016-12-25 00:51:00-73.9670.01.0
12486009472840.789-2.87250532016-12-17 11:10:00-73.9675.01.0
\n", + "

55683 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " wban latitude temperature usaf datetime longitude \\\n", + "204647 14732 40.783 2.8 725030 2016-01-02 03:00:00 -73.867 \n", + "204670 14732 40.779 -4.4 725030 2016-01-22 13:51:00 -73.880 \n", + "204694 14732 40.779 5.0 725030 2016-01-08 02:51:00 -73.880 \n", + "204701 14732 40.779 -1.1 725030 2016-01-04 15:51:00 -73.880 \n", + "204715 14732 40.779 4.4 725030 2016-01-01 21:51:00 -73.880 \n", + "... ... ... ... ... ... ... \n", + "1248471 94728 40.789 4.4 725053 2016-12-23 13:51:00 -73.967 \n", + "1248555 94728 40.789 5.0 725053 2016-12-12 13:51:00 -73.967 \n", + "1248580 94728 40.789 3.9 725053 2016-12-18 07:01:00 -73.967 \n", + "1248597 94728 40.789 7.8 725053 2016-12-25 00:51:00 -73.967 \n", + "1248600 94728 40.789 -2.8 725053 2016-12-17 11:10:00 -73.967 \n", + "\n", + " precipDepth precipTime \n", + "204647 NaN NaN \n", + "204670 0.0 1.0 \n", + "204694 0.0 1.0 \n", + "204701 0.0 1.0 \n", + "204715 0.0 1.0 \n", + "... ... ... \n", + "1248471 0.0 1.0 \n", + "1248555 0.0 1.0 \n", + "1248580 NaN NaN \n", + "1248597 0.0 1.0 \n", + "1248600 5.0 1.0 \n", + "\n", + "[55683 rows x 8 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weather_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again call `pandas.Series.dt.normalize` on the `datetime` field in the weather data so it matches the time key in `taxi_holidays_df`.\n", + "\n", + "\n", + "Next group the weather data to have daily aggregated weather values. Define a dict `aggregations` to define how to aggregate each field at a daily level. For`temperature` take the mean and for `precipTime` and `precipDepth` take the daily maximum. Use the `groupby()` function along with the aggregations to group the data. Preview the data to ensure there is one record per day." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precipTimetemperatureprecipDepth
datetime
2016-01-011.05.1973450.0
2016-01-021.02.5678570.0
2016-01-031.03.8464290.0
2016-01-041.00.1238940.0
2016-01-056.0-7.2062500.0
2016-01-066.0-0.8963960.0
2016-01-076.03.1806450.0
2016-01-081.04.3840910.0
2016-01-096.06.7102743.0
2016-01-1024.010.943655254.0
\n", + "
" + ], + "text/plain": [ + " precipTime temperature precipDepth\n", + "datetime \n", + "2016-01-01 1.0 5.197345 0.0\n", + "2016-01-02 1.0 2.567857 0.0\n", + "2016-01-03 1.0 3.846429 0.0\n", + "2016-01-04 1.0 0.123894 0.0\n", + "2016-01-05 6.0 -7.206250 0.0\n", + "2016-01-06 6.0 -0.896396 0.0\n", + "2016-01-07 6.0 3.180645 0.0\n", + "2016-01-08 1.0 4.384091 0.0\n", + "2016-01-09 6.0 6.710274 3.0\n", + "2016-01-10 24.0 10.943655 254.0" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weather_df[\"datetime\"] = weather_df[\"datetime\"].dt.normalize()\n", + "\n", + "# group by datetime\n", + "aggregations = {\"precipTime\": \"max\", \"temperature\": \"mean\", \"precipDepth\": \"max\"}\n", + "weather_df_grouped = weather_df.groupby(\"datetime\").agg(aggregations)\n", + "weather_df_grouped.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: The examples in this tutorial merge data using Pandas functions and custom aggregations, but the Open Datasets SDK has classes designed to easily merge and enrich data sets. See the [notebook](https://github.com/Azure/OpenDatasetsNotebooks/blob/master/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb) for code examples of these design patterns." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleanse data\n", + "\n", + "Merge the existing taxi and holiday data with the new weather data. This time `datetime` is the only key, and again perform a left-join of the data. Run the `describe()` function on the new dataframe to see summary statistics for each field." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDpassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_dayhr_sinhr_cosdy_sindy_cosprecipTimetemperatureprecipDepth
count24000.00000024000.00000024000.00000012000.00000012000.00000012000.00000012000.00000024000.00000024000.00000024000.00000024000.00000024000.00000024000.0000002.400000e+0424000.00000024000.00000024000.00000024000.00000024000.000000
mean1.7896671.3552922.830398-73.81439340.678791-73.83701940.69072914.6682516.50000015.0687503.24779213.582875-0.239687-1.510585e-02-0.079292-0.05963013.31866713.8782721037.956292
std0.4075541.0200183.1183023.0163851.6631522.6986091.48803211.7385323.4521248.4775551.9512096.7083720.6675287.048175e-010.7144570.69264010.3331629.4844432788.844868
min1.0000000.0000000.000000-74.1648250.000000-75.1864400.000000-200.0000001.0000001.0000000.0000000.000000-1.000000-1.000000e+00-0.974928-0.9009691.000000-13.3794640.000000
25%2.0000001.0000001.040000-73.96137040.693539-73.96751440.6951287.8800003.7500008.0000002.0000009.000000-0.866025-7.071068e-01-0.781831-0.9009696.0000006.6207730.000000
50%2.0000001.0000001.840000-73.94713240.745928-73.94586940.74591411.3000006.50000015.0000003.00000015.000000-0.500000-1.836970e-160.000000-0.2225216.00000013.10832310.000000
75%2.0000001.0000003.500000-73.91963840.802049-73.91305940.79107617.7500009.25000022.0000005.00000019.0000000.2588197.071068e-010.7818310.62349024.00000022.944737127.000000
max2.0000007.000000106.6800000.00000041.0810470.00000041.081055450.00000012.00000030.0000006.00000023.0000001.0000001.000000e+000.9749281.00000024.00000031.3036659999.000000
\n", + "
" + ], + "text/plain": [ + " vendorID passengerCount tripDistance pickupLongitude \\\n", + "count 24000.000000 24000.000000 24000.000000 12000.000000 \n", + "mean 1.789667 1.355292 2.830398 -73.814393 \n", + "std 0.407554 1.020018 3.118302 3.016385 \n", + "min 1.000000 0.000000 0.000000 -74.164825 \n", + "25% 2.000000 1.000000 1.040000 -73.961370 \n", + "50% 2.000000 1.000000 1.840000 -73.947132 \n", + "75% 2.000000 1.000000 3.500000 -73.919638 \n", + "max 2.000000 7.000000 106.680000 0.000000 \n", + "\n", + " pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\n", + "count 12000.000000 12000.000000 12000.000000 24000.000000 \n", + "mean 40.678791 -73.837019 40.690729 14.668251 \n", + "std 1.663152 2.698609 1.488032 11.738532 \n", + "min 0.000000 -75.186440 0.000000 -200.000000 \n", + "25% 40.693539 -73.967514 40.695128 7.880000 \n", + "50% 40.745928 -73.945869 40.745914 11.300000 \n", + "75% 40.802049 -73.913059 40.791076 17.750000 \n", + "max 41.081047 0.000000 41.081055 450.000000 \n", + "\n", + " month_num day_of_month day_of_week hour_of_day hr_sin \\\n", + "count 24000.000000 24000.000000 24000.000000 24000.000000 24000.000000 \n", + "mean 6.500000 15.068750 3.247792 13.582875 -0.239687 \n", + "std 3.452124 8.477555 1.951209 6.708372 0.667528 \n", + "min 1.000000 1.000000 0.000000 0.000000 -1.000000 \n", + "25% 3.750000 8.000000 2.000000 9.000000 -0.866025 \n", + "50% 6.500000 15.000000 3.000000 15.000000 -0.500000 \n", + "75% 9.250000 22.000000 5.000000 19.000000 0.258819 \n", + "max 12.000000 30.000000 6.000000 23.000000 1.000000 \n", + "\n", + " hr_cos dy_sin dy_cos precipTime temperature \\\n", + "count 2.400000e+04 24000.000000 24000.000000 24000.000000 24000.000000 \n", + "mean -1.510585e-02 -0.079292 -0.059630 13.318667 13.878272 \n", + "std 7.048175e-01 0.714457 0.692640 10.333162 9.484443 \n", + "min -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n", + "25% -7.071068e-01 -0.781831 -0.900969 6.000000 6.620773 \n", + "50% -1.836970e-16 0.000000 -0.222521 6.000000 13.108323 \n", + "75% 7.071068e-01 0.781831 0.623490 24.000000 22.944737 \n", + "max 1.000000e+00 0.974928 1.000000 24.000000 31.303665 \n", + "\n", + " precipDepth \n", + "count 24000.000000 \n", + "mean 1037.956292 \n", + "std 2788.844868 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 10.000000 \n", + "75% 127.000000 \n", + "max 9999.000000 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how=\"left\", on=[\"datetime\"])\n", + "taxi_holidays_weather_df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the summary statistics, you see that there are several fields that have outliers or values that will reduce model accuracy. First filter the lat/long fields to be within the same bounds you used for filtering weather data. The `tripDistance` field has some bad data, because the minimum value is negative. The `passengerCount` field has bad data as well, with the max value being 210 passengers. Lastly, the `totalAmount` field has negative values, which don't make sense in the context of our model.\n", + "\n", + "Filter out these anomolies using query functions, and then remove the last few columns unnecesary for training.\n", + "\n", + "Note: since a random sample of 2000 was taken for each month of the taxi data, the statistics may vary each time this is ran." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "final_df = taxi_holidays_weather_df.query(\"pickupLatitude>=40.53 and pickupLatitude<=40.88 and \\\n", + " pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \\\n", + " tripDistance>0 and tripDistance<75 and \\\n", + " passengerCount>0 and passengerCount<100 and \\\n", + " totalAmount>0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Call `describe()` again on the data to ensure cleansing worked as expected. The final data is prepared and cleansed, consisting of taxi, holiday, and weather data, and is ready to use for machine learning model training." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vendorIDpassengerCounttripDistancepickupLongitudepickupLatitudedropoffLongitudedropoffLatitudetotalAmountmonth_numday_of_monthday_of_weekhour_of_dayhr_sinhr_cosdy_sindy_cosprecipTimetemperatureprecipDepth
count11763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.00000011763.0000001.176300e+0411763.00000011763.00000011763.00000011763.00000011763.000000
mean1.7901901.3692942.841407-73.93791140.746224-73.91090140.73081814.5579173.50131814.9292703.25231713.538553-0.236544-2.265927e-03-0.070226-0.05905911.99396410.288261192.179546
std0.4071911.0416342.8298640.0411210.0568181.3641140.7534689.9891651.7073508.4757931.9481276.7780120.6688127.048492e-010.7188710.68912210.1147758.5300111223.101074
min1.0000001.0000000.010000-74.03519440.572906-74.1830290.0000000.0100001.0000001.0000000.0000000.000000-1.000000-1.000000e+00-0.974928-0.9009691.000000-13.3794640.000000
25%2.0000001.0000001.090000-73.96160140.693594-73.96779340.6954408.1600002.0000008.0000002.0000009.000000-0.866025-7.071068e-01-0.781831-0.9009691.0000003.5045800.000000
50%2.0000001.0000001.900000-73.94751740.745842-73.94624340.74578911.3000004.00000015.0000003.00000015.000000-0.500000-1.836970e-160.000000-0.2225216.00000010.4682763.000000
75%2.0000001.0000003.530000-73.92050940.801752-73.91380740.78994217.3800005.00000022.0000005.00000019.0000000.2588197.071068e-010.7818310.62349024.00000016.96692341.000000
max2.0000006.00000038.850000-73.73889940.8799820.00000041.073185123.8000006.00000030.0000006.00000023.0000001.0000001.000000e+000.9749281.00000024.00000026.5241079999.000000
\n", + "
" + ], + "text/plain": [ + " vendorID passengerCount tripDistance pickupLongitude \\\n", + "count 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean 1.790190 1.369294 2.841407 -73.937911 \n", + "std 0.407191 1.041634 2.829864 0.041121 \n", + "min 1.000000 1.000000 0.010000 -74.035194 \n", + "25% 2.000000 1.000000 1.090000 -73.961601 \n", + "50% 2.000000 1.000000 1.900000 -73.947517 \n", + "75% 2.000000 1.000000 3.530000 -73.920509 \n", + "max 2.000000 6.000000 38.850000 -73.738899 \n", + "\n", + " pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\n", + "count 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean 40.746224 -73.910901 40.730818 14.557917 \n", + "std 0.056818 1.364114 0.753468 9.989165 \n", + "min 40.572906 -74.183029 0.000000 0.010000 \n", + "25% 40.693594 -73.967793 40.695440 8.160000 \n", + "50% 40.745842 -73.946243 40.745789 11.300000 \n", + "75% 40.801752 -73.913807 40.789942 17.380000 \n", + "max 40.879982 0.000000 41.073185 123.800000 \n", + "\n", + " month_num day_of_month day_of_week hour_of_day hr_sin \\\n", + "count 11763.000000 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean 3.501318 14.929270 3.252317 13.538553 -0.236544 \n", + "std 1.707350 8.475793 1.948127 6.778012 0.668812 \n", + "min 1.000000 1.000000 0.000000 0.000000 -1.000000 \n", + "25% 2.000000 8.000000 2.000000 9.000000 -0.866025 \n", + "50% 4.000000 15.000000 3.000000 15.000000 -0.500000 \n", + "75% 5.000000 22.000000 5.000000 19.000000 0.258819 \n", + "max 6.000000 30.000000 6.000000 23.000000 1.000000 \n", + "\n", + " hr_cos dy_sin dy_cos precipTime temperature \\\n", + "count 1.176300e+04 11763.000000 11763.000000 11763.000000 11763.000000 \n", + "mean -2.265927e-03 -0.070226 -0.059059 11.993964 10.288261 \n", + "std 7.048492e-01 0.718871 0.689122 10.114775 8.530011 \n", + "min -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n", + "25% -7.071068e-01 -0.781831 -0.900969 1.000000 3.504580 \n", + "50% -1.836970e-16 0.000000 -0.222521 6.000000 10.468276 \n", + "75% 7.071068e-01 0.781831 0.623490 24.000000 16.966923 \n", + "max 1.000000e+00 0.974928 1.000000 24.000000 26.524107 \n", + "\n", + " precipDepth \n", + "count 11763.000000 \n", + "mean 192.179546 \n", + "std 1223.101074 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 3.000000 \n", + "75% 41.000000 \n", + "max 9999.000000 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train a model\n", + "\n", + "The data is ready to train a machine learning model." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.linear_model import RidgeCV\n",
+ "from sklearn.linear_model import Ridge\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Training Function\n",
+ "\n",
+ "Define a function that builds a model pipeline that can be trained and then used for scoring. The pipeline has 2 steps: preprocessing and model training.\n",
+ "\n",
+ "Preprocessing Stages:\n",
+ "The preprocessing step of the pipeline has 2 stages, one for numerical features and one for categorical features.\n",
+ "For the numerical features, let's fill in any blanks with 0's. While the training data may not have any nulls in these fields, future data that is scored may, and this step will take care of those values. Optionally, a scaler transformation could be added to this stage as well (an optional variant that adds scaling is sketched a little further down). Similarly, for the categorical variables, let's have the null values filled with \"MISSING\". The categorical variables also need to be one-hot encoded, so that step is included in the pipeline too.\n",
+ "\n",
+ "Model Training Stage:\n",
+ "An input parameter determines which type of model to train. To start, let's test a regularized linear regression (ridge) and a random forest model.\n",
+ "\n",
+ "The two steps are combined into a single pipeline, which is what the function returns."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def createClassModel(algo_name, catg, nums):\n",
+ "    # numeric stage: impute missing values with 0\n",
+ "    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])\n",
+ "\n",
+ "    # categorical stage: impute missing values with \"MISSING\", then one-hot encode\n",
+ "    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=\"MISSING\")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
+ "\n",
+ "    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])\n",
+ "\n",
+ "    if algo_name == 'linear_regression':\n",
+ "        model = Ridge(alpha=100)\n",
+ "    elif algo_name == 'random_forest':\n",
+ "        model = RandomForestRegressor()\n",
+ "    else:\n",
+ "        raise ValueError(f\"Unsupported algo_name: {algo_name}\")\n",
+ "    ModelPipeline = Pipeline(steps=[('preprocessor', preprocessor), (\"model\", model)])\n",
+ "    return ModelPipeline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next, define the arguments that will be passed to the function: `catg_cols` is the list of categorical variables and `num_cols` is the list of numerical variables to be transformed in the preprocessing step. Also define the target column as `label` so it can be reused in later steps."
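+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optional: as mentioned above, a scaler could be added to the numeric stage of the preprocessor. The cell below is only a sketch of that variant; it is not executed here, and the rest of the notebook does not depend on it. It assumes scikit-learn's `StandardScaler`, and the function name `createScaledModel` is purely illustrative."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sketch: same pipeline shape as createClassModel, with a StandardScaler added to the numeric stage.\n",
+ "# Not used by the cells that follow; shown only to illustrate where a scaler would go.\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.linear_model import Ridge\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "\n",
+ "def createScaledModel(algo_name, catg, nums):\n",
+ "    # numeric stage: impute with 0, then standardize to zero mean / unit variance\n",
+ "    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0)), ('scaler', StandardScaler())])\n",
+ "    # categorical stage: impute with \"MISSING\", then one-hot encode\n",
+ "    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=\"MISSING\")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
+ "    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])\n",
+ "    if algo_name == 'linear_regression':\n",
+ "        model = Ridge(alpha=100)\n",
+ "    elif algo_name == 'random_forest':\n",
+ "        model = RandomForestRegressor()\n",
+ "    else:\n",
+ "        raise ValueError(f\"Unsupported algo_name: {algo_name}\")\n",
+ "    return Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Tree-based models such as the random forest are largely insensitive to feature scaling, so the scaled variant would mainly matter for the linear model. The workflow below continues with `createClassModel`."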
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "catg_cols = [\"vendorID\", \"month_num\", \"day_of_month\", \"normalizeHolidayName\", \"isPaidTimeOff\"]\n", + "num_cols = [\"passengerCount\", \"tripDistance\", \"precipTime\", \"temperature\", \"precipDepth\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]\n", + "label = [\"totalAmount\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The training is ready to begin, but first, let's make sure that the categorical variables are stored as strings in the dataframe so the pipeline runs without errors.\n", + "\n", + "Next, the data is split into training and test sets using the `train_test_split()` function from the `scikit-learn` library. The `test_size` parameter determines the percentage of data to allocate to testing. The `random_state` parameter sets a seed for the random number generator, so that the train-test splits are deterministic.\n", + "\n", + "The training happens in a for loop so that both algorithms can be tested. The `createClassModel` function is called to retrieve the pipeline, which is then fit on the training dataset.\n", + "\n", + "Once trained, the model is run against the test dataset to evaluate its performance. Using functions from `sklearn.metrics`, the R2 score, MAPE, and RMSE are used to measure model performance." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "linear_regression\n", + "R2: 0.8034971051723139\n", + "MAPE: 0.15888983234876766\n", + "RMSE: 4.606544019524053\n", + "\n", + "random_forest\n", + "R2: 0.8073017231520601\n", + "MAPE: 0.14715914748857337\n", + "RMSE: 4.5617309259357475\n", + "\n" + ] + } + ], + "source": [ + "# make sure categorical columns are strings\n", + "final_df[catg_cols] = final_df[catg_cols].astype(\"str\")\n", + "\n", + "# split data into training and test sets\n", + "X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)\n", + "\n", + "# test 2 algorithms\n", + "for algorithmname in [\"linear_regression\", \"random_forest\"]:\n", + "    fitPipeline = createClassModel(algorithmname, catg_cols, num_cols)  # get pipeline\n", + "    fitPipeline.fit(X_train, y_train.values.ravel())  # fit pipeline\n", + "\n", + "    y_pred = fitPipeline.predict(X_test)  # score with fitted pipeline\n", + "\n", + "    # evaluate with R2, MAPE, and RMSE\n", + "    r2 = r2_score(y_test, y_pred)\n", + "    mape = mean_absolute_percentage_error(y_test, y_pred)\n", + "    rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n", + "\n", + "    print(algorithmname)\n", + "    print(\"R2:\", r2)\n", + "    print(\"MAPE:\", mape)\n", + "    print(\"RMSE:\", rmse)\n", + "    print()" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "74e9702761b8f12846716a18132904990016d49f378e22e0e13a0e91318de754" + }, + "kernelspec": { + "display_name": "Python 3.8.12 ('mlopsenv')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/workshop/requirements-local.txt b/src/workshop/requirements-local.txt index dbdf82e1..cc424f71 100644 --- a/src/workshop/requirements-local.txt +++
b/src/workshop/requirements-local.txt @@ -1,7 +1,7 @@ -azureml-sdk==1.38.0 -azureml-mlflow==1.38.0 -azureml-opendatasets==1.38.0 -pandas==1.3.5 -scikit-learn==1.0.2 -importlib-metadata<3,>=0.12 +azureml-sdk==1.38.0 +azureml-mlflow==1.38.0 +azureml-opendatasets==1.38.0 +pandas==1.3.5 +scikit-learn==1.0.2 +importlib-metadata<3,>=0.12 msrest==0.6.21 \ No newline at end of file