diff --git a/hdinsight/settings.ps1 b/hdinsight/settings.ps1 index 77da914..afccf4f 100644 --- a/hdinsight/settings.ps1 +++ b/hdinsight/settings.ps1 @@ -1,26 +1,23 @@ # Use unique prefix for cluster resources -$prefix = "andrier" +$user = "andrie" + +# Use user as a prefix for related resources +$prefix = $user $settings = New-Object PSObject -Property @{ # Subscription SubscriptionName = "Visual Studio Ultimate with MSDN" # Storage account - StorageAccountName = "$($prefix)hadooptutorial" - StorageAccountLabel = "RHaddop-tutorial" + StorageAccountName = "$($prefix)rhadooptutorial" + StorageAccountLabel = "RHadoop-tutorial" StorageAccountLocation = "West Europe" # Cluster HDInsightClusterName = "$($prefix)-r-hadoop-tutorial" HDInsightContainerName = "$($prefix)-r-hadoop-tutorial-hdfs" - HDInsightUsername = "admin" + HDInsightUsername = $user HDInsightPassword = "RHadoopTutorial2015!" HDInsightClusterSizeInNodes = 2 HDInsightHeadNodeVMSize = "Large" - - # Custom version of the script referenced at: - # Install and use R on HDInsight Hadoop clusters - # https://azure.microsoft.com/en-us/documentation/articles/hdinsight-hadoop-r-scripts - RInstallerScriptUri = "https://raw.githubusercontent.com/StanislawSwierc/RHadoop-tutorial/master/hdinsight/r-installer.ps1" - #RInstallerScriptUri = "https://raw.githubusercontent.com/$($user)/RHadoop-tutorial/master/hdinsight/r-installer.ps1" } diff --git a/hdinsight/setup.ps1 b/hdinsight/setup.ps1 index 53423c6..07dda50 100644 --- a/hdinsight/setup.ps1 +++ b/hdinsight/setup.ps1 @@ -19,7 +19,7 @@ Add-AzureAccount Select-AzureSubscription $settings.SubscriptionName # Check if storage account exist and create it otherwise -if (!(Get-AzureStorageAccount -StorageAccountName $settings.StorageAccountName -ErrorAction SilentlyContinue)) { +if (!(Get-AzureStorageAccount -StorageAccountName $settings.StorageAccountName -ErrorAction SilentlyContinue -WarningAction silentlyContinue)) { Write-Host "Creating new storage account: $($settings.StorageAccountName)" New-AzureStorageAccount ` -StorageAccountName $settings.StorageAccountName ` @@ -35,6 +35,56 @@ Set-AzureSubscription ` # Get the storage account key $storageAccountKey = (Get-AzureStorageKey $settings.StorageAccountName).Primary +# Create the storage context object +$storageContext = New-AzureStorageContext -StorageAccountName $settings.StorageAccountName -StorageAccountKey $storageAccountKey + +# Check if container exists and create it otherwise +if (!(Get-AzureStorageContainer -Name $settings.HDInsightContainerName -ErrorAction SilentlyContinue)) { + Write-Host "Creating new storage container: $($settings.HDInsightContainerName)" + New-AzureStorageContainer -Name $settings.HDInsightContainerName +} + +# Check if the dataset exist and upload it otherwise. +$datasetPrefix = "user/$($settings.HDInsightUsername)/nyctaxitrips" +if (!(Get-AzureStorageBlob -Prefix $datasetPrefix -Container $settings.HDInsightContainerName)) { + # Dataset consists of 12 trip_data and 12 trip_fare files. They can be copied from + # a publicly available storage container. The most efficient way is to start asynchronous + # copy operations and wait for their completion. + for ($i=1; $i -le 12; $i++) { + Start-CopyAzureStorageBlob ` + -SrcUri "https://nyctaxitrips.blob.core.windows.net/data/trip_fare_$i.csv.zip" ` + -DestBlob "$datasetPrefix/fare/trip_fare_$i.csv.zip" ` + -DestContainer $settings.HDInsightContainerName ` + -DestContext $storageContext + + Start-CopyAzureStorageBlob ` + -SrcUri "https://nyctaxitrips.blob.core.windows.net/data/trip_data_$i.csv.zip" ` + -DestBlob "$datasetPrefix/data/trip_data_$i.csv.zip" ` + -DestContainer $settings.HDInsightContainerName ` + -DestContext $storageContext + } + + # Wait for all operations to complete. + $pendingBlobsCount = 24 + while ($pendingBlobsCount -gt 0) { + Write-Host $(Get-Date) : "Waiting for $pendingBlobsCount/24 asynchronous copy operations." + Start-Sleep -Seconds 60 + $pendingBlobsCount = (Get-AzureStorageBlob -Prefix $datasetPrefix -Container $settings.HDInsightContainerName | + Get-AzureStorageBlobCopyState | + ?{ $_.Status -eq "Pending" } | + Measure-Object).Count + } +} + +# Prepare R installation script. Nodes should be able to download it by its Uri. +$scriptActionBlob = Set-AzureStorageBlobContent ` + -Container $settings.HDInsightContainerName ` + -File "r-installer.ps1" ` + -Blob "user/$($settings.HDInsightUsername)/r-installer.ps1" ` + -Force +$scriptActionBlobToken = New-AzureStorageBlobSASToken -ICloudBlob $scriptActionBlob.ICloudBlob -Permission r -ExpiryTime (Get-Date).AddDays(1) +$scriptActionUri = $scriptActionBlob.ICloudBlob.Uri.AbsoluteUri + $scriptActionBlobToken + # Create cluster configuration $hdinsightConfig = New-AzureHDInsightClusterConfig ` -HeadNodeVMSize $settings.HDInsightHeadNodeVMSize ` @@ -44,9 +94,9 @@ $hdinsightConfig = New-AzureHDInsightClusterConfig ` -StorageAccountKey $storageAccountKey ` -StorageContainerName $settings.HDInsightContainerName | Add-AzureHDInsightScriptAction ` - -Name "Install R" ` + -Name "Install R (x64) on Head and Data nodes" ` -ClusterRoleCollection HeadNode,DataNode ` - -Uri $settings.RInstallerScriptUri + -Uri $scriptActionUri # Convert plain text user name and password to PSCredential object $hdinsightPasswordSecureString = ConvertTo-SecureString -String $settings.HDInsightPassword -AsPlainText -Force @@ -54,13 +104,14 @@ $hdinsightCredential = New-Object -TypeName System.Management.Automation.PSCrede -ArgumentList $settings.HDInsightUsername, $hdinsightPasswordSecureString # Create cluster +Write-Host "$(Get-Date) : Creating new cluster" $hdinsightCluster = New-AzureHDInsightCluster ` -Name $settings.HDInsightClusterName ` -Config $hdinsightConfig ` -Credential $hdinsightCredential ` - -Location $settings.StorageAccountLocation + -Location $settings.StorageAccountLocation ` + -ErrorAction Continue +Write-Host "$(Get-Date) : Operation completed" # Check cluster state -Get-AzureHDInsightCluster -Name $settings.HDInsightClusterName - - +Get-AzureHDInsightCluster -Name $settings.HDInsightClusterName