diff --git a/cost-based-ml/README.md b/cost-based-ml/README.md
index cb298e5..a1d9fa0 100644
--- a/cost-based-ml/README.md
+++ b/cost-based-ml/README.md
@@ -1,6 +1,6 @@
 # Cost-based Machine Learning
 
-So you've built an ML model and evaluated it's performance on a testing dataset? In case of binary classification, the evaluation tells you how many mistakes the model made, i.e. the percentage of false positives and false negatives, and the same stats for the correct behavior of the model, namely, true positives and true negatives. Of course, the fewer the errors, the better, but for any realistic application the percentage of errors is substantial and it is often unclear if the model is worth using. Moreover, if you just look at the total error rate (sum of false positives and false negatives) you may convince yourself that the model is useless. For example, suppose that 90% of the data points belong to class 0 and the rest to class 1, and your model gives 15% total error. This means that if you employ the model you will be making a mistake 15% of the time and if you don't use the model at all (and just assume that all data points belong to class 0) you will be making a mistake only in 10% of cases. Seems like the model is useless in this case, doesn'it?
+So you've built an ML model and evaluated its performance on a testing dataset? In case of binary classification, the evaluation tells you how many mistakes the model made, i.e. the percentage of false positives and false negatives, and the same stats for the correct behavior of the model, namely, true positives and true negatives. Of course, the fewer the errors, the better, but for any realistic application the percentage of errors is substantial and it is often unclear if the model is worth using. Moreover, if you just look at the total error rate (sum of false positives and false negatives) you may convince yourself that the model is useless. For example, suppose that 90% of the data points belong to class 0 and the rest to class 1, and your model gives 15% total error. This means that if you employ the model you will be making a mistake 15% of the time and if you don't use the model at all (and just assume that all data points belong to class 0) you will be making a mistake only in 10% of cases. Seems like the model is useless in this case, doesn't it?
 
 This, however, is a rather simplistic way of looking the model evaluation. The truth is that the different types of mistakes the model makes have different intrinsic costs associated with them, depending on the domain and application. Frequently, even when the total error looks bad, when costs are taken into account, the end result clearly favors the use of ML.
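
To make the cost argument in the README paragraph above concrete, here is a back-of-the-envelope sketch in Python. The 90/10 class split and the 15% total error come from that paragraph; the split of that error into false positives and false negatives, and the per-error costs, are made-up numbers used purely for illustration.

    # Minimal sketch of a cost-based model comparison (illustrative numbers only).
    def expected_cost(fp_rate, fn_rate, cost_fp, cost_fn):
        """Expected cost per data point, given error rates and per-error costs."""
        return fp_rate * cost_fp + fn_rate * cost_fn

    # Hypothetical costs: a missed positive (false negative) is 20x worse than a false alarm.
    COST_FP, COST_FN = 1.0, 20.0

    # Model: 15% total error, assumed here to split into 12% false positives + 3% false negatives.
    model_cost = expected_cost(0.12, 0.03, COST_FP, COST_FN)

    # Baseline "always predict class 0": no false positives, but every class-1
    # point (10% of the data) becomes a false negative.
    baseline_cost = expected_cost(0.00, 0.10, COST_FP, COST_FN)

    print("model    cost per point: %.2f" % model_cost)     # 0.72
    print("baseline cost per point: %.2f" % baseline_cost)  # 2.00

Even though the model's total error rate (15%) is higher than the baseline's (10%), its expected cost per data point is far lower once asymmetric costs are applied, which is exactly the point the README makes.
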
diff --git a/cost-based-ml/cost_based_ml.py b/cost-based-ml/cost_based_ml.py
index a295952..ed6ae0d 100755
--- a/cost-based-ml/cost_based_ml.py
+++ b/cost-based-ml/cost_based_ml.py
@@ -37,7 +37,7 @@ def batch_prediction_data_bucket_key(output_uri_s3, batch_prediction_id):
     key += "batch-prediction/result/{}-{}.gz".format(batch_prediction_id, datasource_filename)
     return bucket, key
 
-# read batch prediction results from S3 and turn them into an numpy array
+# read batch prediction results from S3 and turn them into a numpy array
 def read_test_predictions(bucket, key):
     s3 = boto3.resource('s3')
     obj = s3.Object(bucket, key)
@@ -52,7 +52,7 @@ def read_test_predictions(bucket, key):
     data = np.loadtxt(StringIO(predictions_str), dtype = {'names': names, 'formats': formats}, delimiter=',', skiprows=1, usecols=cols)
     return data
 
-# this historgram replicates what the Amazon ML console is showing for model evaluation
+# this histogram replicates what the Amazon ML console is showing for model evaluation
 def plot_class_histograms(score_n_true_label):
     class_1_scores = [score for (score, true_label) in score_n_true_label if true_label == 1]
     class_0_scores = [score for (score, true_label) in score_n_true_label if true_label == 0]
diff --git a/k-fold-cross-validation/README.md b/k-fold-cross-validation/README.md
index ed52319..3fec4e2 100644
--- a/k-fold-cross-validation/README.md
+++ b/k-fold-cross-validation/README.md
@@ -33,7 +33,7 @@ If you are a Python 2 developer and do not already have `virtualenv` and `pip` t
     sudo apt-get update
     sudo apt-get install python-pip python-virtualenv
 
-Users of other operating systems and package managers can learn more about installing `pip` [here](http://pip.readthedocs.org/en/stable/installing/), and about installing `virtualenv` [here](http://virtualenv.readthedocs.org/en/latest/installation.html).
+Users of other operating systems and package managers can learn more about [installing `pip`](http://pip.readthedocs.org/en/stable/installing/), and about [installing `virtualenv`](http://virtualenv.readthedocs.org/en/latest/installation.html).
 
 After you’ve installed the `virtualenv` and `pip` tools, run:
diff --git a/k-fold-cross-validation/collect_perf.py b/k-fold-cross-validation/collect_perf.py
index 43ae362..8d214d4 100755
--- a/k-fold-cross-validation/collect_perf.py
+++ b/k-fold-cross-validation/collect_perf.py
@@ -119,7 +119,7 @@ def collect_perf(eval_id_list):
     kfolds = len(eval_id_list)
     eval_auc_map = collect_perf(eval_id_list) # start polling & collect
 
-    # Comput the mean/variance of auc scores. Casting kfolds to float for
+    # Compute the mean/variance of auc scores. Casting kfolds to float for
     # Python 2 compatibility.
     avg_auc = sum([x for x in eval_auc_map.values()]) / float(kfolds)
     var_auc = sum([(x - avg_auc) ** 2 for x in eval_auc_map.values()]) / float(
diff --git a/ml-tools-python/wait_for_entity.py b/ml-tools-python/wait_for_entity.py
index 04e6754..ff89975 100755
--- a/ml-tools-python/wait_for_entity.py
+++ b/ml-tools-python/wait_for_entity.py
@@ -22,7 +22,7 @@ ev = evaluation
 bp = batch prediction
 
-Useage:
+Usage:
     python wait_for_entity.py entity_id [entity_type]
 """
 import boto
diff --git a/social-media/README.md b/social-media/README.md
index 9c3f87b..0ee8399 100644
--- a/social-media/README.md
+++ b/social-media/README.md
@@ -85,8 +85,7 @@ To gather the training data, run the following command:
 
 Substitute your company's twitter handle instead of @awscloud and configure
 your Twitter API credentials in config.py.
 Learn how to
-obtain your credentials
-[here](https://dev.twitter.com/oauth/overview/application-owner-access-tokens).
+[obtain your credentials](https://dev.twitter.com/oauth/overview/application-owner-access-tokens).
 
 This will produce a file called `line_separated_tweets_json.txt` that other
 scripts will read later.
@@ -218,7 +217,7 @@ This script requires that `config.py` is present and contains appropriate
 values. Description of the configuration required in `config.py` is as follows:
 
-* *awsAccountId* : The AWS Account Id corresponding to the credentials being used
+* *awsAccountId* : The AWS Account ID corresponding to the credentials being used
   with boto. See [docs](http://docs.aws.amazon.com/general/latest/gr/acct-identifiers.html)
   for details.
 * *kinesisStream* : The name being given to the Kinesis stream. See
diff --git a/social-media/push-json-to-kinesis.py b/social-media/push-json-to-kinesis.py
index 5872acd..503b0b9 100755
--- a/social-media/push-json-to-kinesis.py
+++ b/social-media/push-json-to-kinesis.py
@@ -14,7 +14,7 @@
 """
 Utility to call Amazon Kinesis stream using payload from a file that contains
 line separated json. This script is used in conjunction with
-create-lambda-function.py, which expectes the Kinesis stream to provide the
+create-lambda-function.py, which expects the Kinesis stream to provide the
 input on which predictions are made. All json data being pushed to kinesis is
 first converted to string to string key value pairs as that is the expected
 format by Amazon Machine Learning.
diff --git a/targeted-marketing-java/src/main/java/com/amazonaws/samples/machinelearning/UseModel.java b/targeted-marketing-java/src/main/java/com/amazonaws/samples/machinelearning/UseModel.java
index b8798f9..1b72b02 100644
--- a/targeted-marketing-java/src/main/java/com/amazonaws/samples/machinelearning/UseModel.java
+++ b/targeted-marketing-java/src/main/java/com/amazonaws/samples/machinelearning/UseModel.java
@@ -54,7 +54,7 @@ public static void main(String[] args) throws IOException {
     /**
      * @param args command-line arguments:
      *   mlModelid
-     *   score threshhold
+     *   score threshold
      *   s3:// url where output should go
      */
     public UseModel(String[] args) {
diff --git a/targeted-marketing-python/use_model.py b/targeted-marketing-python/use_model.py
index 74a7475..df4e35d 100755
--- a/targeted-marketing-python/use_model.py
+++ b/targeted-marketing-python/use_model.py
@@ -17,7 +17,7 @@ generate predictions on new data.
 This script needs the id of the ML Model to use. It also requires the score
 threshold.
 
-Useage:
+Usage:
     python use_model.py ml_model_id score_threshold s3_output_url
 
 For example:
diff --git a/targeted-marketing-scala/src/main/scala/com/amazonaws/samples/machinelearning/UserModel.scala b/targeted-marketing-scala/src/main/scala/com/amazonaws/samples/machinelearning/UserModel.scala
index fb4d492..133606a 100644
--- a/targeted-marketing-scala/src/main/scala/com/amazonaws/samples/machinelearning/UserModel.scala
+++ b/targeted-marketing-scala/src/main/scala/com/amazonaws/samples/machinelearning/UserModel.scala
@@ -14,13 +14,13 @@ import scala.io.Source
  * to make batch predictions.
  *
  * command-line arguments:
- * mlModelid scoreThreshhold s3://url-where-output-should-go
+ * mlModelid scoreThreshold s3://url-where-output-should-go
  */
 object UserModel extends App {
 
   val unscoredDataUrl = "s3://aml-sample-data/banking-batch.csv"
   val dataSchema = getClass.getResourceAsStream("/banking-batch.csv.schema")
 
-  require(args.length == 3, "command-line arguments: mlModelid scoreThreshhold s3://url-where-output-should-go")
+  require(args.length == 3, "command-line arguments: mlModelid scoreThreshold s3://url-where-output-should-go")
 
   val mlModelId = args(0)
   val threshold = args(1).toFloat
   val s3OutputUrl = args(2)
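
As an aside on the push-json-to-kinesis.py docstring corrected above: its note that each JSON record is "converted to string to string key value pairs" (the flat format Amazon Machine Learning expects) amounts to something like the sketch below. This is only an illustration under stated assumptions: the helper name is hypothetical, it is not the script's actual implementation, and dropping nested values rather than flattening them is an assumption.

    import json

    def to_string_pairs(record):
        """Hypothetical helper: flatten a parsed JSON record into str -> str pairs."""
        flat = {}
        for key, value in record.items():
            if isinstance(value, (dict, list)):
                continue  # assumption: nested values are dropped, not flattened
            flat[str(key)] = "" if value is None else str(value)
        return flat

    line = '{"text": "great service!", "retweet_count": 2, "user": {"name": "x"}}'
    print(to_string_pairs(json.loads(line)))
    # {'text': 'great service!', 'retweet_count': '2'}
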