diff --git a/source/.gitignore b/source/.gitignore index e52473bf..67d098a6 100644 --- a/source/.gitignore +++ b/source/.gitignore @@ -5,7 +5,6 @@ _nb_*/ *.blg *_bibertool.bib _bookdown.yml -*.md !README.md libs/ @@ -20,3 +19,6 @@ build.ninja source/*.png diagrams/*.png _marked_*.Rmd + +/_build/ +node_modules diff --git a/source/myst.yml b/source/myst.yml new file mode 100644 index 00000000..a9c16d70 --- /dev/null +++ b/source/myst.yml @@ -0,0 +1,18 @@ +# See docs at: https://mystmd.org/guide/frontmatter +version: 1 +project: + id: da5a80ed-8df5-4893-8121-734298c02763 + # title: + # description: + # keywords: [] + # authors: [] + github: https://github.com/resampling-stats/resampling-with + # To autogenerate a Table of Contents, run "myst init --write-toc" + plugins: + - plugin.mjs + abbreviations: + NCI: National Cancer Institute + numbering: + headings: true +site: + template: book-theme diff --git a/source/package-lock.json b/source/package-lock.json new file mode 100644 index 00000000..416afe05 --- /dev/null +++ b/source/package-lock.json @@ -0,0 +1,346 @@ +{ + "name": "source", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "dependencies": { + "myst-common": "^1.4.7" + } + }, + "node_modules/@types/unist": { + "version": "2.0.10", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.10.tgz", + "integrity": "sha512-IfYcSBWE3hLpBg8+X2SEa8LVkJdJEkT2Ese2aaLs3ptGdVtABxndrMaxuFlQ1qdFf9Q5rDvDpxI3WwgvKFAsQA==" + }, + "node_modules/bail": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz", + "integrity": "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" + }, + "node_modules/credit-roles": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/credit-roles/-/credit-roles-2.1.0.tgz", + "integrity": "sha512-wt1jw7lDnzY1Ob4cDHpXboN4Bfu6l7reKal0zxtKnxogqw916l+Iu862LxIze4U4y44ssAd0TOQqVg2cXY2V6Q==" + }, + "node_modules/css-selector-parser": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/css-selector-parser/-/css-selector-parser-1.4.1.tgz", + "integrity": "sha512-HYPSb7y/Z7BNDCOrakL4raGO2zltZkbeXyAd6Tg9obzix6QhzxCotdBl6VT0Dv4vZfJGVz3WL/xaEI9Ly3ul0g==" + }, + "node_modules/doi-utils": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/doi-utils/-/doi-utils-2.0.3.tgz", + "integrity": "sha512-vXWjB8Wz195m0PSw9jCwkCVBtPO+yIY/iPUQ1qTLD3GZ7alXreaoLKA3D4MAewPOBt+D8BU1LJiOF7WKpy9ifQ==" + }, + "node_modules/extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==" + }, + "node_modules/is-buffer": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.5.tgz", + "integrity": "sha512-i2R6zNFDwgEHJyQUtJEk0XFi1i0dPFn/oqjK3/vPCcDeJvW5NQ83V8QbicfF1SupOaB0h8ntgBC2YiE7dfyctQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "engines": { + "node": ">=4" + } + }, + "node_modules/is-plain-obj": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", + "integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/mdast": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/mdast/-/mdast-3.0.0.tgz", + "integrity": "sha512-xySmf8g4fPKMeC07jXGz971EkLbWAJ83s4US2Tj9lEdnZ142UP5grN73H1Xd3HzrdbU5o9GYYP/y8F9ZSwLE9g==", + "deprecated": "`mdast` was renamed to `remark`" + }, + "node_modules/myst-common": { + "version": "1.4.7", + "resolved": "https://registry.npmjs.org/myst-common/-/myst-common-1.4.7.tgz", + "integrity": "sha512-1y87JkWMEYfR7N06h8qo+RjoZBGQcX1HEvbcAJmvGln7lAa+3pFO5Yyy2gLd7X/mBLvCS5fDl9J3uWu0Yin+SQ==", + "dependencies": { + "mdast": "^3.0.0", + "myst-frontmatter": "^1.4.7", + "myst-spec": "^0.0.5", + "nanoid": "^4.0.0", + "unified": "^10.1.2", + "unist-util-remove": "^3.1.0", + "unist-util-select": "^4.0.3", + "unist-util-visit": "^4.1.2", + "vfile": "^5.0.0", + "vfile-message": "^3.0.0" + } + }, + "node_modules/myst-frontmatter": { + "version": "1.4.7", + "resolved": "https://registry.npmjs.org/myst-frontmatter/-/myst-frontmatter-1.4.7.tgz", + "integrity": "sha512-4LVhHAdYiSzc56lo6ZfJrlDO9BLcr6kc6Q7/uBK13+8fVw2R5j8+RrvtBtAacIqBXKSWpQvV3gCN3OXhliM5fg==", + "dependencies": { + "credit-roles": "^2.1.0", + "doi-utils": "^2.0.0", + "myst-toc": "^0.1.1", + "orcid": "^1.0.0", + "simple-validators": "^1.0.5", + "spdx-correct": "^3.2.0" + } + }, + "node_modules/myst-spec": { + "version": "0.0.5", + "resolved": "https://registry.npmjs.org/myst-spec/-/myst-spec-0.0.5.tgz", + "integrity": "sha512-L/4TV1l5ZbWUOgSnXqiYrx192SV4I+HqjX7TBQ4k02/heeNFckpkUIyLulraap5heTyLcJs8UYBxu+Kv5JiiRw==" + }, + "node_modules/myst-toc": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/myst-toc/-/myst-toc-0.1.1.tgz", + "integrity": "sha512-0kYzFvKkw0M/Awjc3VOnJG+NFRYZwVk2exjpRvoegC9E1ODZqsfMlbsZmWsNKmZVVe2HIMEY++Eg0yswoMoY/g==", + "dependencies": { + "simple-validators": "^1.0.4" + } + }, + "node_modules/nanoid": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-4.0.2.tgz", + "integrity": "sha512-7ZtY5KTCNheRGfEFxnedV5zFiORN1+Y1N6zvPTnHQd8ENUvfaDBeuJDZb2bN/oXwXxu3qkTXDzy57W5vAmDTBw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "bin": { + "nanoid": "bin/nanoid.js" + }, + "engines": { + "node": "^14 || ^16 || >=18" + } + }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, + "node_modules/orcid": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/orcid/-/orcid-1.0.0.tgz", + "integrity": "sha512-zdRFh+1FAZt0Vy9XaKNu+5h4zXC2pkunZH8MV7tSGoq54skik8nxn5ks3E9f33m+tPxuliSRzEHz7owb1Z0HYg==", + "bin": { + "orcid": "dist/orcid.cjs" + } + }, + "node_modules/simple-validators": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/simple-validators/-/simple-validators-1.0.5.tgz", + "integrity": "sha512-s4RGt469rRossOfo+x6DxA1UgIIvRKaVjikb61quZqukWJa7xVOF0GdRCTFB3w8GVDdK16HvMjIjcED5CAmxvQ==" + }, + "node_modules/spdx-correct": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.2.0.tgz", + "integrity": "sha512-kN9dJbvnySHULIluDHy32WHRUu3Og7B9sbY7tsFLctQkIqnMh3hErYgdMjTYuqmcXX+lK5T1lnUt3G7zNswmZA==", + "dependencies": { + "spdx-expression-parse": "^3.0.0", + "spdx-license-ids": "^3.0.0" + } + }, + "node_modules/spdx-exceptions": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/spdx-exceptions/-/spdx-exceptions-2.5.0.tgz", + "integrity": "sha512-PiU42r+xO4UbUS1buo3LPJkjlO7430Xn5SVAhdpzzsPHsjbYVflnnFdATgabnLude+Cqu25p6N+g2lw/PFsa4w==" + }, + "node_modules/spdx-expression-parse": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-3.0.1.tgz", + "integrity": "sha512-cbqHunsQWnJNE6KhVSMsMeH5H/L9EpymbzqTQ3uLwNCLZ1Q481oWaofqH7nO6V07xlXwY6PhQdQ2IedWx/ZK4Q==", + "dependencies": { + "spdx-exceptions": "^2.1.0", + "spdx-license-ids": "^3.0.0" + } + }, + "node_modules/spdx-license-ids": { + "version": "3.0.18", + "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.18.tgz", + "integrity": "sha512-xxRs31BqRYHwiMzudOrpSiHtZ8i/GeionCBDSilhYRj+9gIcI8wCZTlXZKu9vZIVqViP3dcp9qE5G6AlIaD+TQ==" + }, + "node_modules/trough": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/trough/-/trough-2.2.0.tgz", + "integrity": "sha512-tmMpK00BjZiUyVyvrBK7knerNgmgvcV/KLVyuma/SC+TQN167GrMRciANTz09+k3zW8L8t60jWO1GpfkZdjTaw==", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/unified": { + "version": "10.1.2", + "resolved": "https://registry.npmjs.org/unified/-/unified-10.1.2.tgz", + "integrity": "sha512-pUSWAi/RAnVy1Pif2kAoeWNBa3JVrx0MId2LASj8G+7AiHWoKZNTomq6LG326T68U7/e263X6fTdcXIy7XnF7Q==", + "dependencies": { + "@types/unist": "^2.0.0", + "bail": "^2.0.0", + "extend": "^3.0.0", + "is-buffer": "^2.0.0", + "is-plain-obj": "^4.0.0", + "trough": "^2.0.0", + "vfile": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-is": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-5.2.1.tgz", + "integrity": "sha512-u9njyyfEh43npf1M+yGKDGVPbY/JWEemg5nH05ncKPfi+kBbKBJoTdsogMu33uhytuLlv9y0O7GH7fEdwLdLQw==", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-remove": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/unist-util-remove/-/unist-util-remove-3.1.1.tgz", + "integrity": "sha512-kfCqZK5YVY5yEa89tvpl7KnBBHu2c6CzMkqHUrlOqaRgGOMp0sMvwWOVrbAtj03KhovQB7i96Gda72v/EFE0vw==", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-is": "^5.0.0", + "unist-util-visit-parents": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-select": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/unist-util-select/-/unist-util-select-4.0.3.tgz", + "integrity": "sha512-1074+K9VyR3NyUz3lgNtHKm7ln+jSZXtLJM4E22uVuoFn88a/Go2pX8dusrt/W+KWH1ncn8jcd8uCQuvXb/fXA==", + "dependencies": { + "@types/unist": "^2.0.0", + "css-selector-parser": "^1.0.0", + "nth-check": "^2.0.0", + "zwitch": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-stringify-position": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-3.0.3.tgz", + "integrity": "sha512-k5GzIBZ/QatR8N5X2y+drfpWG8IDBzdnVj6OInRNWm1oXrzydiaAT2OQiA8DPRRZyAKb9b6I2a6PxYklZD0gKg==", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-visit": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-4.1.2.tgz", + "integrity": "sha512-MSd8OUGISqHdVvfY9TPhyK2VdUrPgxkUtWSuMHF6XAAFuL4LokseigBnZtPnJMu+FbynTkFNnFlyjxpVKujMRg==", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-is": "^5.0.0", + "unist-util-visit-parents": "^5.1.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-visit-parents": { + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-5.1.3.tgz", + "integrity": "sha512-x6+y8g7wWMyQhL1iZfhIPhDAs7Xwbn9nRosDXl7qoPTSCy0yNxnKc+hWokFifWQIDGi154rdUqKvbCa4+1kLhg==", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-is": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile": { + "version": "5.3.7", + "resolved": "https://registry.npmjs.org/vfile/-/vfile-5.3.7.tgz", + "integrity": "sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==", + "dependencies": { + "@types/unist": "^2.0.0", + "is-buffer": "^2.0.0", + "unist-util-stringify-position": "^3.0.0", + "vfile-message": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-message": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-3.1.4.tgz", + "integrity": "sha512-fa0Z6P8HUrQN4BZaX05SIVXic+7kE3b05PWAtPuYP9QLHsLKYR7/AlLW3NtOrpXRLeawpDLMsVkmk5DG0NXgWw==", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-stringify-position": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/zwitch": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz", + "integrity": "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + } + } +} diff --git a/source/package.json b/source/package.json new file mode 100644 index 00000000..0ddb35ea --- /dev/null +++ b/source/package.json @@ -0,0 +1,5 @@ +{ + "dependencies": { + "myst-common": "^1.4.7" + } +} diff --git a/source/plugin.mjs b/source/plugin.mjs new file mode 100644 index 00000000..a7bee28c --- /dev/null +++ b/source/plugin.mjs @@ -0,0 +1,145 @@ +import { fileError, liftChildren, NotebookCell } from 'myst-common'; +import { remove } from 'unist-util-remove'; + +const MODE = 'python'; + +const R_VARS = { + lang: 'R', + np_or_r: 'R', + other_lang: 'Python', + cell: 'chunk', + nb_app: 'RStudio', + nb_fmt: 'RMarkdown', + run_key: 'Ctl/Cmd-Shift-Enter', + array: 'vector', + an_array: 'a vector', + true: '`TRUE`', + false: '`FALSE`', + sample: '`sample`', + bincount: '`tabulate`', + sum: '`sum`', +}; +const PYTHON_VARS = { + lang: 'Python', + np_or_r: 'NumPy', + other_lang: 'R', + cell: 'cell', + nb_app: 'Jupyter', + nb_fmt: 'Jupyter', + run_key: 'Shift-Enter', + array: 'array', + an_array: 'an array', + true: [{ type: 'inlineCode', value: 'True' }], + false: [{ type: 'inlineCode', value: 'False' }], + sample: [{ type: 'inlineCode', value: 'rnd.choice' }], + bincount: [{ type: 'inlineCode', value: 'np.bincount' }], + sum: [{ type: 'inlineCode', value: 'np.sum' }], +}; + +/** + * Create a documentation section for a directive + * + * @type {import('myst-common').RoleSpec} + */ +const varShortCode = { + name: 'var', + body: { + type: String, + required: true, + }, + run(data, vfile) { + const replacement = PYTHON_VARS[data.body]; + if (!replacement) { + fileError(vfile, `shortcode: "var", unknown replacement "${data.body}"`, { node: data.node }); + return; + } + if (typeof replacement === 'string') { + return [{ type: 'text', value: replacement }]; + } + return replacement; + }, +}; + +/** + * Create a documentation section for a directive + * + * @type {import('myst-common').RoleSpec} + */ +const spanRole = { + name: 'span', + body: { + type: String, + required: true, + }, + run(data, vfile) { + if (data.node.options?.class !== MODE) return []; + return [{ type: 'span', children: data.node.children[0].children }]; + }, +}; + +const langBlock = { + name: 'lang-block', + alias: ['python', 'r'], + body: { + type: String, + required: true, + }, + run(data, vfile) { + const name = data.name; + if (name !== MODE) return []; + const code = { + type: 'code', + lang: data.name, + executable: true, + value: data.body ?? '', + }; + const output = { + type: 'output', + data: [], + }; + const block = { + type: 'block', + kind: NotebookCell.code, + children: [code, output], + data: {}, + }; + return [block]; + }, +}; + +/** + * Create a documentation section for a directive + * + * @type {import('myst-common').TransformSpec} + */ +const divTransform = { + name: 'Hide Divs', + stage: 'document', + plugin: (opts, utils) => (mdast) => { + const divs = utils.selectAll('div', mdast); + divs.map((div) => { + if (div.class === MODE) { + div.type = '__lift__'; + } else { + div.type = '__remove__'; + } + }); + remove(mdast, '__remove__'); + liftChildren(mdast, '__lift__'); + return mdast; + }, +}; + +/** + * @type {import('myst-common').MystPlugin} + */ +const plugin = { + name: 'Short Code Plugins', + author: 'Rowan Cockett', + license: 'MIT', + directives: [langBlock], + roles: [varShortCode, spanRole], + transforms: [divTransform], +}; + +export default plugin; diff --git a/source/resampling_with_code.Rmd b/source/resampling_with_code.md similarity index 69% rename from source/resampling_with_code.Rmd rename to source/resampling_with_code.md index f8fa8bb4..ec491ec6 100644 --- a/source/resampling_with_code.Rmd +++ b/source/resampling_with_code.md @@ -5,7 +5,7 @@ jupyter: notebook: additional: all excluded: - - language_info + - language_info text_representation: extension: .Rmd format_name: rmarkdown @@ -16,61 +16,63 @@ jupyter: language: python name: python3 resampling_with: - ed2_fname: null + ed2_fname: null +export: + - format: typst --- +# Resampling with code {#sec-resampling-code} + ```{r setup, include=FALSE} source("_common.R") ``` -# Resampling with code {#sec-resampling-code} - @sec-resampling-method used simulation and resampling from -tables of random numbers, dice, and coins. Making random choices in this way +tables of random numbers, dice, and coins. Making random choices in this way can make it easier to understand the process, but of course, physical methods of making random outcomes can be slow and boring. We saw that short computer programs can do a huge number of resampling -trials in a less than a second. The flexibility of a programming language +trials in a less than a second. The flexibility of a programming language makes it possible to simulate many different outcomes and tests. Programs can build up tables of random numbers, and do basic tasks like counting the number of values in a row or taking -proportions. With these simple tools, we can simulate many problems +proportions. With these simple tools, we can simulate many problems in probability and statistics. In this chapter, we will model another problem using {{< var lang >}}, but this chapter will add three new things. -* The problem we will work on is a little different from the ambulances problem - from @sec-resampling-method. It is a real problem about deciding whether a +- The problem we will work on is a little different from the ambulances problem + from @sec-resampling-method. It is a real problem about deciding whether a new cancer treatment is better than the alternatives, and it introduces the idea of making a model of the world, to ask questions about chances and probabilities. -* We will slow down a little to emphasize the steps in solving this kind of - problem. First we work out how to simulate a single *trial*. Then we work +- We will slow down a little to emphasize the steps in solving this kind of + problem. First we work out how to simulate a single _trial_. Then we work out how to run many simulated trials. -* We sprinted through the code in @sec-resampling-method, with the promise we - would come back to the details. Here we go into more detail about some ideas - from the code in the last chapter. These are: +- We sprinted through the code in @sec-resampling-method, with the promise we + would come back to the details. Here we go into more detail about some ideas + from the code in the last chapter. These are: - * Storing several values together in one place, with + - Storing several values together in one place, with [arrays]{.python}[vectors]{.r}. - * Using *functions* (code recipes) to apply procedures. - * *Comparing* numbers to other numbers. - * *Counting* numbers that match a condition. + - Using _functions_ (code recipes) to apply procedures. + - _Comparing_ numbers to other numbers. + - _Counting_ numbers that match a condition. -In the next chapter, we will talk more about *using {{< var array >}}s* to -store results, and *for loops* to repeat a procedure many times. +In the next chapter, we will talk more about _using {{< var array >}}s_ to +store results, and _for loops_ to repeat a procedure many times. ## Statistics and probability -We have already emphasized that *statistics* is a way of drawing conclusions +We have already emphasized that _statistics_ is a way of drawing conclusions about data from the real world, in the presence of random variation; -*probability* is the way of reasoning about random variation. This chapter -introduces our first *statistical* problem, where we use probability to draw +_probability_ is the way of reasoning about random variation. This chapter +introduces our first _statistical_ problem, where we use probability to draw conclusions about some important data — about a potential cure for a type of -cancer. We will not make much of the distinction between probability and +cancer. We will not make much of the distinction between probability and statistics here, but we will come back to it several times in later chapters. ::: todo @@ -80,7 +82,7 @@ Check we have discussed this before. ## A new treatment for Burkitt lymphoma [Burkitt lymphoma](https://en.wikipedia.org/wiki/Burkitt_lymphoma) is an -unusual cancer of the lymphatic system. The lymphatic system is a vein-like +unusual cancer of the lymphatic system. The lymphatic system is a vein-like network throughout the body that is involved in the immune reaction to disease. In developed countries, with standard treatment, the cure rate for Burkitt lymphoma is about 90%. @@ -88,18 +90,18 @@ lymphoma is about 90%. In 2006, researchers at the US National Cancer Institute (NCI), tested a new treatment for Burkitt lymphoma [@dunleavy2006burkitt]. They gave the new treatment to 17 patients, and found that all 17 patients were doing well after -two years or more of follow up. By "doing well", we mean that their lymphoma +two years or more of follow up. By "doing well", we mean that their lymphoma had not progressed; as a short-hand, we will say that these patients were "cured", but of course, we do not know what happened to them after this follow up. Here is where we put on our statistical hat and ask ourselves the following -question — *how surprised are we that the NCI researchers saw their result of -17 out of 17 patients cured?* +question — _how surprised are we that the NCI researchers saw their result of +17 out of 17 patients cured?_ At this stage you might and should ask, what could we possibly mean by -"surprised"? That is a good and important question, and we will discuss that -much more in the chapters to come. For now, please bear with us as we do a +"surprised"? That is a good and important question, and we will discuss that +much more in the chapters to come. For now, please bear with us as we do a thought experiment. Let us forget the 17 out of 17 result of the NCI study for a moment. Imagine @@ -109,29 +111,29 @@ lymphoma. Saint Hypothetical were not using the NCI treatment, they were using the standard treatment. We already know that each patient given the standard treatment has a 90% chance -of cure. Given that 90% cure rate, what is the chance that 17 out of 17 of the +of cure. Given that 90% cure rate, what is the chance that 17 out of 17 of the Hypothetical group will be cured? You may notice that this question about the Hypothetical group is similar to the problem of the 20 ambulances in Chapter @sec-resampling-method. In that problem, we were interested to know how likely it was that 3 or more of 20 ambulances would be out of action on any one day, given that each ambulance had -a 10% chance of being out of action. Here we would like to know the chances +a 10% chance of being out of action. Here we would like to know the chances that all 17 patients would be cured, given that each patient has a 90% chance of being cured. ## A physical model of the hypothetical hospital As in the ambulance example, we could make a physical model of chance in -this world. For example, to simulate whether a given patient is cured or not +this world. For example, to simulate whether a given patient is cured or not by a 90% effective treatment, we could throw a ten sided die and record the -result. We could say, arbitrarily, that a result of 0 means "not cured", and +result. We could say, arbitrarily, that a result of 0 means "not cured", and all the numbers 1 through 9 mean "cured" (typical 10-sided dice have sides numbered 0 through 9). -We could roll 17 dice to simulate one "trial" in this random world. For each +We could roll 17 dice to simulate one "trial" in this random world. For each trial, we record the number of dice that show numbers 1 through 9 (and not -0). This will be a number between 0 and 17, and it is the number of patients +0). This will be a number between 0 and 17, and it is the number of patients "cured" in our simulated trial. @fig-17-d10s is the result of one such trial we did with a set of 17 10-sided @@ -140,7 +142,7 @@ dice we happened to have to hand: ![One roll of 17 10-sided dice](images/17_d10s.png){#fig-17-d10s} The trial in @fig-17-d10s shows are four dice with the 0 face uppermost, and -the rest with numbers from 1 through 9. Therefore, there were 13 out of 17 +the rest with numbers from 1 through 9. Therefore, there were 13 out of 17 not-zero numbers, meaning that 13 out of 17 simulated "patients" were "cured" in this simulated trial. @@ -150,45 +152,44 @@ Check we have adapted text above to picture. We could repeat this simulated trial procedure 100 times, and we would then have 100 counts of the not-zero numbers. Each of the 100 counts would be the -number of patients cured in that trial. We can ask how many of these 100 -counts were equal to 17. This will give us an estimate of the probability we +number of patients cured in that trial. We can ask how many of these 100 +counts were equal to 17. This will give us an estimate of the probability we would see 17 out of 17 patients cured, given that any one patient has a 90% -chance of cure. For example, say we saw 15 out of 100 counts were equal to -17. That would give us an estimate of 15 / 100 or 0.15 or 15%, for the +chance of cure. For example, say we saw 15 out of 100 counts were equal to 17. That would give us an estimate of 15 / 100 or 0.15 or 15%, for the probability we would see 17 out of 17 patients cured. So, if Saint Hypothetical General did see 17 out of 17 patients cured with the standard treatment, they would be a little surprised, because they would only -expect to see that happen 15% of the time. But they would not be *very* -surprised — 15% of the time is uncommon, but not *very* uncommon. +expect to see that happen 15% of the time. But they would not be _very_ +surprised — 15% of the time is uncommon, but not _very_ uncommon. ## A trial, a run, a count and a proportion Here we stop to emphasize the steps in the process of a random simulation. -1. *We decide what we mean by one trial*. Here one trial has the same meaning +1. _We decide what we mean by one trial_. Here one trial has the same meaning in medicine as resampling — we mean the result of treating 17 patients. One - *simulated trial* is then the simulation of one set of outcomes from 17 + _simulated trial_ is then the simulation of one set of outcomes from 17 patients. -2. Work out the *outcome* of interest from the trial. The outcome here is the +2. Work out the _outcome_ of interest from the trial. The outcome here is the number of patients cured. -3. We work out a way to *simulate one trial*. Here we chose to throw 17 - 10-sided dice, and count the number of not zero values. This is the outcome +3. We work out a way to _simulate one trial_. Here we chose to throw 17 + 10-sided dice, and count the number of not zero values. This is the outcome from one simulation trial. -4. We *repeat* the simulated trial procedure many times, and collect the - results from each trial. Say we repeat the trial procedure 100 times; we - will call this a *run* of 100 trials. -5. We *count* the number of trials with an outcome that matches the outcome we +4. We _repeat_ the simulated trial procedure many times, and collect the + results from each trial. Say we repeat the trial procedure 100 times; we + will call this a _run_ of 100 trials. +5. We _count_ the number of trials with an outcome that matches the outcome we are interested in. In this case we are interested in the outcome 17 out of - 17 cured, so we count the number of trials with a score of 17. Say 15 out - of the run of 100 trials had an outcome of 17 cured. That is our *count*. -6. Finally we divide the count by the number of trials to get the *proportion*. - From the example above, we divide 15 by 100 to 0.15 (15%). This is our + 17 cured, so we count the number of trials with a score of 17. Say 15 out + of the run of 100 trials had an outcome of 17 cured. That is our _count_. +6. Finally we divide the count by the number of trials to get the _proportion_. + From the example above, we divide 15 by 100 to 0.15 (15%). This is our estimate of the chance of seeing 17 out of 17 patients cured in any one - trial. We can also call this an estimate of the *probability* that 17 out + trial. We can also call this an estimate of the _probability_ that 17 out of 17 patients will be cured on any on trial. -Our next step is to work out the code for step 2: *simulate one trial*. +Our next step is to work out the code for step 2: _simulate one trial_. ## Simulate one trial with code @@ -196,20 +197,21 @@ We can use the computer to do something very similar to rolling 17 10-sided dice, by asking the computer for 17 random whole numbers from 0 through 9. :::{.callout-note} + ## Whole numbers A whole number is a number that is not negative, and does not have fractional -part (does not have anything after a decimal point). 0 and 1 and 2 and 3 are +part (does not have anything after a decimal point). 0 and 1 and 2 and 3 are whole numbers, but -1 and $\frac{3}{5}$ and 11.3 are not. The whole numbers from 0 through 9 are 0, 1, 2, 3, 4, 5, 6, 7, 8, 9. ::: -We have already discussed what we mean by *random* in +We have already discussed what we mean by _random_ in @sec-randomness-computer. ::: python -We will be asking the computer to generate many random numbers. So, before we -start, we again import NumPy and get its *random number generator*: +We will be asking the computer to generate many random numbers. So, before we +start, we again import NumPy and get its _random number generator_: ```{python} import numpy as np @@ -218,15 +220,16 @@ import numpy as np # it `rnd`. `rnd` is short for "random". rnd = np.random.default_rng() ``` + ::: ## From numbers to {{< var array >}}s -We [next]{.python} need to prepare the *sequence* of numbers that we want +We [next]{.python} need to prepare the _sequence_ of numbers that we want {{< var np_or_r >}} to select from. -We have already seen the idea that {{< var lang >}} has *values* that are -individual numbers. Remember, a *variable* is a *named value*. Here we +We have already seen the idea that {{< var lang >}} has _values_ that are +individual numbers. Remember, a _variable_ is a _named value_. Here we attach the name `a` to the value 1. ```{python} @@ -241,12 +244,12 @@ a <- 1 a ``` -{{< var np_or_r >}} also allows *values* that are *sequences of numbers*. {{< -var np_or_r >}} calls these sequences *{{< var array >}}s*. +{{< var np_or_r >}} also allows _values_ that are _sequences of numbers_. {{< +var np_or_r >}} calls these sequences _{{< var array >}}s_. ::: r -The name *vector* sounds rather technical and mathematical, but the only -important idea for us is that a *vector* stores a *sequence* of numbers. +The name _vector_ sounds rather technical and mathematical, but the only +important idea for us is that a _vector_ stores a _sequence_ of numbers. ::: Here we make a {{< var array >}} that contains the 10 numbers we will @@ -267,7 +270,7 @@ some_numbers ``` Notice that the value for `some_numbers` is {{< var an_array >}}, -and that this value *contains* 10 numbers. +and that this value _contains_ 10 numbers. Put another way, `some_numbers` is now the name we can use for this collection of 10 values. @@ -277,14 +280,11 @@ analysis, and we will be using these for nearly every example in this book. ## Functions {#sec-introducing-functions} -*Functions* are another tool that we will be using everywhere, and that you seen already, although we have not introduced them until now. - -You can think of functions as named *production lines*. +_Functions_ are another tool that we will be using everywhere, and that you seen already, although we have not introduced them until now. -For example, consider the {{< var lang >}} *function* [`round`]{.r}[`np.round`]{.python} +You can think of functions as named _production lines_. -::: python -::: +For example, consider the {{< var lang >}} _function_ [`round`]{.r}[`np.round`]{.python} ```{python} # We load the Numpy library so we have access to the Numpy functions. @@ -293,46 +293,49 @@ import numpy as np [`round`]{.r}[`np.round`]{.python} is the name for a simple production line, that takes in a number, and (by default) sends back the number rounded to the -nearest *integer*. +nearest _integer_. :::{.callout-note} + ## What is an integer? -An *integer* is a positive or negative *whole number*. +An _integer_ is a positive or negative _whole number_. -In other words, a number is an *integer* if the number is *either* a whole -number (0, 1, 2 ...), *or* a negative whole number (-1, -2, -3 ...). All of +In other words, a number is an _integer_ if the number is _either_ a whole +number (0, 1, 2 ...), _or_ a negative whole number (-1, -2, -3 ...). All of -208, -2, 0, 10, 105 are integers, but $\frac{3}{5}$, -10.3 and 0.2 are not. -We will use the term *integer* fairly often, because it is a convenient way to +We will use the term _integer_ fairly often, because it is a convenient way to name all the positive and negative whole numbers. ::: -Think of a function as a named *production line*. We sent the function -(production line) raw material (components) to work on. The production line +Think of a function as a named _production line_. We sent the function +(production line) raw material (components) to work on. The production line does some work on the components. A finished result comes off the other end. Therefore, think of [`round`]{.r}[`np.round`]{.python} as the name of a -production line, that takes in a *component* (in this case, any number), and -does some work, and sends back the finished *result* (in this case, the number +production line, that takes in a _component_ (in this case, any number), and +does some work, and sends back the finished _result_ (in this case, the number rounded to the nearest integer. -The components we send to a function are called *arguments*. The finished -result the function sends back is the *return value*. +The components we send to a function are called _arguments_. The finished +result the function sends back is the _return value_. -* **Arguments** : the value or values we send to a function. -* **Return value** : the values the function sends back. +- **Arguments** : the value or values we send to a function. +- **Return value** : the values the function sends back. See @fig-round_function_pl for an illustration of [`round`]{.r}[`np.round`]{.python} as a production line. -```{r fig-round_function_pl, opts.label='svg_fig', fig.cap="The `round` function as a production line"} -include_svg('diagrams/round_function_pl.svg') +```{figure} diagrams/round_function_pl.svg +:label: fig-round_function_pl +:width: 50% +The `round` function as a production line ``` In the next few code {{< var cell >}}s, you see examples where [`round`]{.r}[`np.round`]{.python} takes in a not-integer number, as an -*argument*, and sends back the nearest integer as the *return value*: +_argument_, and sends back the nearest integer as the _return value_: ```{python} # Put in 3.2, round sends back 3. @@ -355,7 +358,7 @@ round(-2.7) ``` Like many functions, [`round`]{.r}[`np.round`]{.python} can take more than one -argument (component). You can send `range` the number of digits you want to +argument (component). You can send `range` the number of digits you want to round to, after the number of you want it to work on, like this (see @fig-round_ndigits_pl): @@ -371,27 +374,29 @@ np.round(3.1415, 2) round(3.1415, 2) ``` -```{r fig-round_ndigits_pl, opts.label='svg_fig', fig.cap="`round` with optional arguments specifying number of digits"} -include_svg('diagrams/round_function_ndigits_pl.svg') +```{figure} diagrams/round_function_ndigits_pl.svg +:label: fig-round_ndigits_pl +:width: 50% +`round` with optional arguments specifying number of digits ``` -Notice that the second argument — here 2 — is *optional*. We only have to send -`round` one argument: the number we want it to round. But we can *optionally* +Notice that the second argument — here 2 — is _optional_. We only have to send +`round` one argument: the number we want it to round. But we can _optionally_ send it a second argument — the number of decimal places we want it to round -to. If we don't specify the second argument, then `round` assumes we want to +to. If we don't specify the second argument, then `round` assumes we want to round to 0 decimal places, and therefore, to the nearest integer. ## Functions and named arguments {#sec-named-arguments} In the example above, we sent `round` two arguments. `round` knows that we mean the first argument to be the number we want to round, and the second argument -is the number of decimal places we want to round to. It knows which is which -by the *position* of the arguments — the *first* argument is the *number* it -should round, and *second* is the number of digits. +is the number of decimal places we want to round to. It knows which is which +by the _position_ of the arguments — the _first_ argument is the _number_ it +should round, and _second_ is the number of digits. -In fact, internally, the `round` function also gives these arguments *names*. +In fact, internally, the `round` function also gives these arguments _names_. It calls the number it should round — [`a`]{.python}[`x`]{.r} — and the number -of digits it should round to — [`digits`]{.r}[`decimals`]{.python}. This is +of digits it should round to — [`digits`]{.r}[`decimals`]{.python}. This is useful, because it is often clearer and simpler to identify the argument we are specifying with its name, instead of just relying on its position. @@ -412,7 +417,7 @@ round(3.1415, 2) In this call, we relied on the fact that we, the people writing the code, and you, the person reading the code, remembers that the second argument (2) means -the number of decimal places it should round to. But, we can also specify the +the number of decimal places it should round to. But, we can also specify the argument using its name, like this (see [@fig-round_function_named]{.r}[@fig-np_round_function_named]{.python}): @@ -428,19 +433,17 @@ np.round(3.1415, decimals=2) round(3.1415, digits=2) ``` -```{r fig-round_function_named, opts.label='svg_fig', fig.cap="The `round` function with argument names"} -include_svg('diagrams/round_function_named.svg') +```{figure} diagrams/np_round_function_named.svg +:label: fig-np_round_function_named +:width: 50% +The `np.round` function with argument names ``` -```{r fig-np_round_function_named, opts.label='svg_fig', fig.cap="The `np.round` function with argument names"} -include_svg('diagrams/np_round_function_named.svg') -``` +Here {{< var lang >}} sees the _first_ argument, as before, and assumes that it +is the number we want to round. Then it sees the second, named argument — +[`digits=2`]{.r}[`decimals=2`]{.python} — and knows, _from the name_, that we mean this to be the number of decimals to round to. -Here {{< var lang >}} sees the *first* argument, as before, and assumes that it -is the number we want to round. Then it sees the second, named argument — -[`digits=2`]{.r}[`decimals=2`]{.python} — and knows, *from the name*, that we mean this to be the number of decimals to round to. - -In fact, we could even specify *both* arguments by name, like this: +In fact, we could even specify _both_ arguments by name, like this: ```{python} # Put in 3.1415, and the number of digits to round to (2). @@ -463,19 +466,18 @@ for your reader, where your most important reader may be you, coming back to the code in a week or a year. :::{.callout-note} + ## How do you know what names to use for the function arguments? You can find the names of the function arguments in the help for the function, -either online, or in the notebook interface. For example, to get the help for +either online, or in the notebook interface. For example, to get the help for [`round`]{.r}[`np.round`]{.python}, including the argument names, you could make a new {{< var cell >}}, and type [`?round`]{.r}[`np.round?`]{.python}, then execute the cell [by running the chunk]{.r}[by pressing -Shift-Enter]{.python}. This will show the help for the function in the +Shift-Enter]{.python}. This will show the help for the function in the notebook interface. ::: - - ## Ranges {#sec-ranges} Now let us return to the variable `some_numbers` that we created above: @@ -494,21 +496,22 @@ some_numbers <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) some_numbers ``` -In fact, we often need to do this: generate a sequence or *range* of integers, +In fact, we often need to do this: generate a sequence or _range_ of integers, such as 0 through 9. ::: {.callout-note} + ### Pick a number from 1 through 5 Ranges can be confusing in normal speech because it is not always clear whether -they include their beginning and end. For example, if someone says "pick a -number between 1 and 5", do they mean *all* the numbers, including the first -and last (any of 1 or 2 or 3 or 4 or 5)? Or do they mean only the numbers that -are *between* 1 and 5 (so 2 or 3 or 4)? Or do they mean all the numbers up to, +they include their beginning and end. For example, if someone says "pick a +number between 1 and 5", do they mean _all_ the numbers, including the first +and last (any of 1 or 2 or 3 or 4 or 5)? Or do they mean only the numbers that +are _between_ 1 and 5 (so 2 or 3 or 4)? Or do they mean all the numbers up to, but not including 5 (so 1 or 2 or 3 or 4)? To avoid this confusion, we will nearly always use "from" and "through" in -ranges, meaning that we do include both the start and the end number. For +ranges, meaning that we do include both the start and the end number. For example, if we say "pick a number from 1 through 5" we mean one of 1 or 2 or 3 or 4 or 5. ::: @@ -523,10 +526,10 @@ some_numbers ``` ::: python -Notice that we send `np.arange` the *arguments* 0 and 10. The first -argument, here 0, is the *start* value. The second argument, here 10, is the -*stop* value. Numpy (in the `arange` function) understands this to mean: -*start* at 0 (the start value) and go up to *but do not include* 10 (the stop +Notice that we send `np.arange` the _arguments_ 0 and 10. The first +argument, here 0, is the _start_ value. The second argument, here 10, is the +_stop_ value. Numpy (in the `arange` function) understands this to mean: +_start_ at 0 (the start value) and go up to _but do not include_ 10 (the stop value). You can therefore read `np.arange(0, 10)` as "the sequence of integers starting @@ -551,7 +554,7 @@ some_integers ``` When we sent `arange` a single argument, like this, `arange` understands this -to mean we have sent just the *stop* value, and that is should assume a *start* +to mean we have sent just the _stop_ value, and that is should assume a _start_ value of 0. Again, if we wanted, we could send this argument by name: @@ -566,7 +569,7 @@ some_integers ::: ::: r -R allows you to write a colon (`:`) between two values, to mean that you want a *vector* (sequence) that is all the integers from the first value (before the colon) *through* the second value (after the colon): +R allows you to write a colon (`:`) between two values, to mean that you want a _vector_ (sequence) that is all the integers from the first value (before the colon) _through_ the second value (after the colon): ::: ```{r} @@ -604,10 +607,10 @@ np.arange(7) ## `range` in Python {#sec-python-range} -So far you have seen ranges of integers using `np.arange`. The `np.` prefix +So far you have seen ranges of integers using `np.arange`. The `np.` prefix refers to the fact that `np.arange` is a function from the Numpy module -(library). The `a` in `arange` signals that the result `np.arange` returns is -an *array*: +(library). The `a` in `arange` signals that the result `np.arange` returns is +an _array_: ```{python} arr = np.arange(7) @@ -624,9 +627,9 @@ We do often use `np.arange` to get a range of integers in a convenient array format, but Python has another way of getting a range of integers — the `range` function. -The `range` function is very similar to `np.arange`, but it is *not* part of -Numpy — it is basic function in Python — and it does *not* return an *array* of -numbers, it returns something else. Here we ask for a `range` from 0 through 6 +The `range` function is very similar to `np.arange`, but it is _not_ part of +Numpy — it is basic function in Python — and it does _not_ return an _array_ of +numbers, it returns something else. Here we ask for a `range` from 0 through 6 (0 up to, but not including 7): ```{python} @@ -635,17 +638,17 @@ r = range(7) r ``` -Notice that the thing that came back is something that *represents* or *stands -in for* the number 0 through 6. It is not an array, but a specific type of +Notice that the thing that came back is something that _represents_ or _stands +in for_ the number 0 through 6. It is not an array, but a specific type of thing called — a `range`: ```{python} type(r) ``` -The `range` above is a container for the numbers 0 through 6. We can get the +The `range` above is a container for the numbers 0 through 6. We can get the numbers out of the container in many different ways, but one of them is to -convert this container to an array, using the `np.array` function. The +convert this container to an array, using the `np.array` function. The `np.array` function takes the thing we pass it, and makes it into an array. When we apply `np.array` to `r` above, we get the numbers that `r` contains: @@ -671,10 +674,10 @@ np.array(r_2) ``` You may reasonably ask — why do I need this `range` thing, if I have the very -similar `np.arange`? The answer is — you don't *need* `range`, and you can +similar `np.arange`? The answer is — you don't _need_ `range`, and you can always use `np.arange` where you would use `range`, but for reasons we will go into later (@sec-for-range), `range` is a good option when we want to represent -a sequence of numbers as input to a `for` loop. We cover `for` loops in more +a sequence of numbers as input to a `for` loop. We cover `for` loops in more detail in @sec-for-loops, but for now, the only thing to remember is that `range` and `np.arange` are both ways of expressing sequential ranges of integers. @@ -684,9 +687,10 @@ integers. ## Choosing values at random {#sec-random-choice} We can use the [`rnd.choice`]{.python}[`sample`]{.r} function to select a -single value *at random* from the sequence of numbers in `some_integers`. +single value _at random_ from the sequence of numbers in `some_integers`. :::{.callout-note} + ## More on {{< var sample >}} The {{< var sample >}} function will be a fundamental tool for taking many kinds of samples, and we cover it in more detail in @sec-sampling-tools. @@ -708,45 +712,54 @@ my_integer ``` Like [`np.round`]{.python}[`round`]{.r} (above), -[`rnd.choice`]{.python}[`sample`]{.r} is a *function*. +[`rnd.choice`]{.python}[`sample`]{.r} is a _function_. :::python :::{.callout-note} + ## Functions and methods -Actually, to be precise, we should call `rnd.choice` a *method*. A method is a -*function attached to a value*. In this case the function `choice` is attached -to the value `rnd`. That's not an important distinction for us at the moment, +Actually, to be precise, we should call `rnd.choice` a _method_. A method is a +_function attached to a value_. In this case the function `choice` is attached +to the value `rnd`. That's not an important distinction for us at the moment, so please forgive our strategic imprecision, and let us continue to say that `rnd.choice` is a function. ::: ::: -As you remember, a function is a named *production line*. In our case, the +As you remember, a function is a named _production line_. In our case, the production line has the name [`rnd.choice`]{.python}[the `sample` function]{.r}. We sent [`rnd.choice`]{.python}[the `sample` function]{.r}. a value to work on -— an *argument*. In this case, the argument was the value of `some_integers`. +— an _argument_. In this case, the argument was the value of `some_integers`. ::: r `sample` also needs the number of random values we should select from the first -argument. We can send the number of values we want with the *second argument*. +argument. We can send the number of values we want with the _second argument_. ::: [@fig-rnd_choice_pl]{.python}[@fig-sample_pl]{.r} is a diagram illustrating an example run of the {{< var sample >}} function (production line). ::: python -```{r fig-rnd_choice_pl, opts.label='svg_fig', fig.cap="Example run of the `rnd.choice` function"} -include_svg('diagrams/rnd_choice_pl.svg') + +```{figure} diagrams/rnd_choice_pl.svg +:label: fig-rnd_choice_pl +:width: 50% +Example run of the `rnd.choice` function ``` + ::: ::: r -```{r fig-sample_pl, opts.label='svg_fig', fig.cap="Example run of the `sample` function"} -include_svg('diagrams/sample_pl.svg') + +```{figure} diagrams/sample_pl.svg +:label: fig-sample_pl +:width: 50% +Example run of the `sample` function ``` + ::: Here is the same code again, with new comments. @@ -776,13 +789,13 @@ In the code above, we asked Python to select a single number at random — because that is what `rnd.choice` does by default. In fact, the people who wrote `rnd.choice`, wrote it to be flexible in the -work that it can do. In particular, we can tell `rnd.choice` to select *any -number of values* at random, by adding a new *argument* to the function. +work that it can do. In particular, we can tell `rnd.choice` to select _any +number of values_ at random, by adding a new _argument_ to the function. In our case, we would like Numpy to select 17 numbers at random from the sequence of `some_integers`. -To do this, we add an argument to the function that tells it *how many* numbers +To do this, we add an argument to the function that tells it _how many_ numbers we want it to select. ::: @@ -790,18 +803,18 @@ we want it to select. In the code above, we asked R to select a single number at random — by sending 1 as the second argument to the function. -As you can imagine, we can tell `sample` to select *any number of values* +As you can imagine, we can tell `sample` to select _any number of values_ at random, by changing the second argument to the function. In our case, we would like R to select 17 numbers at random from the sequence of `some_integers`. -But — there is a complication here. By default, `sample` selects numbers from -the first argument *without replacement*, meaning that, by default, sample +But — there is a complication here. By default, `sample` selects numbers from +the first argument _without replacement_, meaning that, by default, sample cannot select the same number twice, and in our case, where we want 17 numbers, that is bad, because `sample` is going to run out of numbers. To get the result we want, we must also add an extra argument: `replace=TRUE`. `replace=TRUE` -tells R to sample `some_integers` *with replacement*, where `sample` can select +tells R to sample `some_integers` _with replacement_, where `sample` can select the same number more than once in the same sample. Sampling with and without replacement is a fundamental distinction in probability and statistics. @sec-sampling-tools goes into much more detail about this, but for now, please @@ -826,13 +839,13 @@ a <- sample(some_integers, 17, replace=TRUE) a ``` -As you can see, the function sent back (returned) 17 numbers. Because it is +As you can see, the function sent back (returned) 17 numbers. Because it is sending back more than one number, the thing it sends back is {{< var an_array >}}, where the {{< var array >}} has 17 elements. ### `sum` — adding all the values -Bear with us for a short diversion. You will see why we made this diversion +Bear with us for a short diversion. You will see why we made this diversion soon. {{< var np_or_r >}} has a function [`sum`]{.r}[`np.sum`]{.python} that will @@ -841,7 +854,7 @@ add up all the numbers in {{< var an_array >}}. You can see the contents of `a` above. [`sum`]{.r}[`np.sum`]{.python} adds all the numbers in the {{< var array >}} -together, to give the *sum* of the {{< var array >}}. The *sum* is just the +together, to give the _sum_ of the {{< var array >}}. The _sum_ is just the result of adding the second element to the first, then adding third element to the result, and the fourth element to the result, and so on. @@ -860,7 +873,7 @@ is the basis for one simulated trial in the world of Saint Hypothetical General. Our next job is to get the code to count the number of numbers that are not -zero in the {{< var array >}} `a`. That will give us the number of +zero in the {{< var array >}} `a`. That will give us the number of patients who were cured in simulated trial. Another way of asking this question, is to ask how many elements in `a` are @@ -868,8 +881,8 @@ greater than zero. ### Comparison -To ask whether a number is greater than zero, we use *comparison*. Here is a -*greater than zero* comparison on a single number: +To ask whether a number is greater than zero, we use _comparison_. Here is a +_greater than zero_ comparison on a single number: ```{python} n = 5 @@ -885,10 +898,10 @@ n <- 5 n > 0 ``` -`>` is a *comparison* — it *asks a question* about the numbers either side of +`>` is a _comparison_ — it _asks a question_ about the numbers either side of it. In this case `>` is asking the question "is the value of `n` (on the left hand side) greater than 0 (on the right hand side)?" The value of `n` is 5, so -the question becomes, "is 5 greater than 0?" The answer is Yes, and {{< var +the question becomes, "is 5 greater than 0?" The answer is Yes, and {{< var lang >}} represents this Yes answer as the value {{< var true >}}. In contrast, the comparison below boils down to "is 0 greater than 0?", to @@ -909,12 +922,12 @@ p <- 0 p > 0 ``` -So far you have seen the results of comparison on a single number. Now say we -do the same comparison on {{< var an_array >}}. For example, -say we ask the question "is the value of `a` greater than 0"? Remember, `a` is -{{< var an_array >}} containing 17 values. We are comparing 17 +So far you have seen the results of comparison on a single number. Now say we +do the same comparison on {{< var an_array >}}. For example, +say we ask the question "is the value of `a` greater than 0"? Remember, `a` is +{{< var an_array >}} containing 17 values. We are comparing 17 values to one value (0). What answer do you think {{< var np_or_r >}} will -give? You may want to think a little about this before you read on. +give? You may want to think a little about this before you read on. As a reminder, here is the current value for `a`: @@ -944,10 +957,10 @@ a > 0 There are 17 values in `a`, so the comparison to 0 means there are 17 comparisons, and 17 answers. {{< var np_or_r >}} therefore returns - *{{< var an_array >}}* of 17 elements, containing these 17 -answers. The first answer is the answer to the question "is the value of the -*first* element of `a` greater than 0", and the second is the answer to "is the -value of the *second* element of `a` greater than 0". +_{{< var an_array >}}_ of 17 elements, containing these 17 +answers. The first answer is the answer to the question "is the value of the +_first_ element of `a` greater than 0", and the second is the answer to "is the +value of the _second_ element of `a` greater than 0". Let us store the result of this comparison to work on: @@ -970,7 +983,7 @@ q ## Counting {{< var true >}} values with `sum` {#sec-count-with-sum} Notice above that there is one {{< var true >}} element in `q` for every -element in `a` that was greater than 0. It only remains to *count* the number +element in `a` that was greater than 0. It only remains to _count_ the number of {{< var true >}} values in `q`, to get the count of patients in our simulated trial who were cured. @@ -1008,7 +1021,8 @@ TRUE == 1 Therefore, the function `sum`, when applied to {{< var an_array >}} of {{< var true >}} and {{< var false >}} values, will count the number of {{< var true ->}} values in the {{< var array >}}. + +> }} values in the {{< var array >}}. To see this in action we can make a new {{< var array >}} of {{< var true >}} and {{< var false >}} values, and try using @@ -1035,7 +1049,8 @@ Because {{< var true >}} counts as 1, and {{< var false >}} counts as values 1 + 0 + 1 + 1 + 0, to give 3. We can apply the same operation on `q` to count the number of {{< var true ->}} values. + +> }} values. ```{python} # Count the number of True values in "q" @@ -1087,18 +1102,18 @@ b ## Repeating the trial -Now we know how to do one simulated trial, we could just keep running the {{< var cell >}} above, and writing down the result each time. Once we had run the {{< var cell >}} 100 times, we would have 100 counts. Then we could look at the 100 counts to see how many were equal to 17 (all 17 simulated patients cured on that trial). At least that would be much faster than rolling 17 dice 100 times, but we would also like the computer to automate the process of repeating the trial, and keeping track of the counts. +Now we know how to do one simulated trial, we could just keep running the {{< var cell >}} above, and writing down the result each time. Once we had run the {{< var cell >}} 100 times, we would have 100 counts. Then we could look at the 100 counts to see how many were equal to 17 (all 17 simulated patients cured on that trial). At least that would be much faster than rolling 17 dice 100 times, but we would also like the computer to automate the process of repeating the trial, and keeping track of the counts. -Please forgive us as we race ahead again, as we did in the last chapter. As in -the last chapter, we will use a *results* {{< var array >}} called `z` to -store the count for each trial. As in the last chapter, we will use a `for` -loop to repeat the trial procedure many times. As in the last chapter, we will +Please forgive us as we race ahead again, as we did in the last chapter. As in +the last chapter, we will use a _results_ {{< var array >}} called `z` to +store the count for each trial. As in the last chapter, we will use a `for` +loop to repeat the trial procedure many times. As in the last chapter, we will not explain the counts {{< var array >}} of the `for` loop in any detail, because we are going to cover those in the next chapter. -Let us now imagine that we want to do 100 simulated trials at Saint Hypothetical General. This will give us 100 counts. We will want to store the count for each trial. +Let us now imagine that we want to do 100 simulated trials at Saint Hypothetical General. This will give us 100 counts. We will want to store the count for each trial. To do this, we make {{< var an_array >}} called `z` to hold the -100 counts. We have called the {{< var array >}} `z`, but we could have +100 counts. We have called the {{< var array >}} `z`, but we could have called it anything we liked, such as `counts` or `results` or `cecilia`. ```{python} @@ -1113,12 +1128,12 @@ z = np.zeros(100) z <- numeric(100) ``` -Next we use a `for` loop to *repeat the single trial procedure*. +Next we use a `for` loop to _repeat the single trial procedure_. Notice that the single trial procedure, inside this `for` loop, is the same as the single trial procedure above — the only two differences are: -* The trial procedure is inside the loop, and -* We are storing the count for each trial as we go. +- The trial procedure is inside the loop, and +- We are storing the count for each trial as we go. We will go into more detail on how this works in the next chapter. @@ -1170,7 +1185,7 @@ z Finally, we need to count how many of the trials results we stored in `z` gave a "cured" count of 17. -We can ask the question whether a single number is equal to 17 using the *double equals* comparison: `==`. +We can ask the question whether a single number is equal to 17 using the _double equals_ comparison: `==`. ```{python} s = 17 @@ -1186,35 +1201,37 @@ s <- 17 s == 17 ``` -::: {.callout-note} -::: Python +:::: {.callout-note} +::: python + ## Single and double equals -Notice that the *double equals* `==` means something entirely different to -Python than the single equals `=`. In the code above, Python reads `s = 17` to -mean "Set the variable `s` to have the value 17". In technical terms the -single equals is called an *assignment* operator, because it means *assign* the +Notice that the _double equals_ `==` means something entirely different to +Python than the single equals `=`. In the code above, Python reads `s = 17` to +mean "Set the variable `s` to have the value 17". In technical terms the +single equals is called an _assignment_ operator, because it means _assign_ the value 17 to the variable `s`. The code `s == 17` has a completely different meaning. ::: ::: r + ## Double equals -The *double equals* `==` above is a *comparison* in R. +The _double equals_ `==` above is a _comparison_ in R. ::: It means "give {{< var true >}} if the value in `s` is equal to 17, and {{< -var false >}} otherwise". The `==` is a *comparison* operator — it is for +var false >}} otherwise". The `==` is a _comparison_ operator — it is for comparing two values — here the value in `s` -and the value 17. This comparison, like all comparisons, returns an answer -that is either {{< var true >}} or {{< var false >}}. In our case `s` +and the value 17. This comparison, like all comparisons, returns an answer +that is either {{< var true >}} or {{< var false >}}. In our case `s` has the value 17, so the comparison becomes `17 == 17`, meaning "is 17 equal to 17?", to which the answer is "Yes", and {{< var lang >}} sends back {{< var true >}}. -:::::: +:::: -We can ask this question of *all 100 counts* by asking the question: is the {{< +We can ask this question of _all 100 counts_ by asking the question: is the {{< var array >}} `z` equal to 17, like this: ```{python} @@ -1252,7 +1269,7 @@ n_all_cured ``` `n_all_cured` is the number of simulated trials for which all patients were -cured. It only remains to get the proportion of trials for which this was +cured. It only remains to get the proportion of trials for which this was true, and to do this, we divide by the number of trials. ```{python} @@ -1279,7 +1296,7 @@ The question was — was the result of their trial — 17 out of 17 patients cur — surprising. Then, for reasons we did not explain in detail, we changed tack, and asked the -same question about a hypothetical set of 17 patients getting the *standard* +same question about a hypothetical set of 17 patients getting the _standard_ treatment in Saint Hypothetical General. That Hypothetical question turns out to be fairly easy to answer, because we @@ -1288,33 +1305,33 @@ cured in such a hypothetical trial, on the assumption that each patient has a 90% chance of being cured with the standard treatment. The answer for Saint Hypothetical General was — we would be somewhat surprised, -but not astonished. We only get 17 out of 17 patients cured about one time in +but not astonished. We only get 17 out of 17 patients cured about one time in six. -Now let us return to the NCI trial. Should the trial authors be surprised by -their results? If they assumed that their new treatment was *exactly as -effective* as the standard treatment, the result of the trial is a bit unusual, -just by chance. It is up to us to decide whether the result is unusual enough -to make us think that the actual NCI treatment might in fact have been *more -effective* than the standard treatment. +Now let us return to the NCI trial. Should the trial authors be surprised by +their results? If they assumed that their new treatment was _exactly as +effective_ as the standard treatment, the result of the trial is a bit unusual, +just by chance. It is up to us to decide whether the result is unusual enough +to make us think that the actual NCI treatment might in fact have been _more +effective_ than the standard treatment. You will see this move again and again as we go through the book. -* We take something that really happened — in this case the 17 out of 17 patients cured. -* Then we imagine a hypothetical world in which the results only depend on +- We take something that really happened — in this case the 17 out of 17 patients cured. +- Then we imagine a hypothetical world in which the results only depend on chance. -* We do simulations in that hypothetical world to see how often we get a result +- We do simulations in that hypothetical world to see how often we get a result like the one that happened in the real world. -* If the real world result (17 out of 17) is an unusual, surprising result in +- If the real world result (17 out of 17) is an unusual, surprising result in the simulations from the hypothetical world, we take that as evidence that the real world result might not be due to chance alone. -We have just described the main idea in statistical inference. If that all +We have just described the main idea in statistical inference. If that all seems strange and backwards to you, do not worry, we will go over that idea -many times in this book. It is not a simple idea to grasp in one go. We hope +many times in this book. It is not a simple idea to grasp in one go. We hope you will find that, as you do more simulations, and think of more hypothetical -worlds, the idea will start to make more sense. Later, we will start to think -about asking *other questions* about probability and chance in the real world. +worlds, the idea will start to make more sense. Later, we will start to think +about asking _other questions_ about probability and chance in the real world. ## Conclusions diff --git a/source/resampling_with_code2.md b/source/resampling_with_code2.md new file mode 100644 index 00000000..ac27ff65 --- /dev/null +++ b/source/resampling_with_code2.md @@ -0,0 +1,1056 @@ +--- +jupyter: + jupytext: + metadata_filter: + notebook: + additional: all + excluded: + - language_info + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.0' + jupytext_version: 0.8.6 + kernelspec: + display_name: Python 3 + language: python + name: python3 +resampling_with: + ed2_fname: null +--- + +```{r setup, include=FALSE} +source("_common.R") +``` + +# More resampling with code {#sec-resampling-two} + +@sec-resampling-code introduced a problem in probability, that was also a +problem in statistics. We asked how surprised we should be at the results of a +trial of a new cancer treatment regime. + +Here we study another urgent problem in the real world - racial bias and the +death penalty. + +## A question of life and death + +This example comes from the excellent Berkeley introduction to data science +[@adhikari2021data8]. + +Robert Swain was a young black man who was sentenced to death in the early +60s. Swain's trial was held in Talladega County, Alabama. At the time, 26% of +the eligible jurors in that county were black, but every member of Swain's +jury was white. Swain and his legal team appealed to the Alabama Supreme +Court, and then to the [US Supreme +Court](https://en.wikipedia.org/wiki/Swain_v._Alabama), arguing that there was +racial bias in the jury selection. They noted that there had been no black +jurors in Talladega county since 1950, even though they made up about a +quarter of the eligible pool of jurors. The US Supreme Court rejected this +argument, in a 6 to 3 opinion, writing that "The overall percentage disparity +has been small and reflects no studied attempt to include or exclude a +specified number of Negros.". + +Swain's team presented a variety of evidence on bias in jury selection, but +here we will look at the obvious and apparently surprising fact that Swain's +jury was entirely white. The Supreme Court decided that the "disparity" +between selection of white and black jurors "has been small" — but how would +they, and how would we, make a rational decision about whether this disparity +*really* was "small"? + +You might reasonably be worried about the result of this decision for Robert +Swain. In fact his death sentence was invalidated by a [later, unrelated +decision](https://en.wikipedia.org/wiki/Furman_v._Georgia) and he served a +long prison sentence instead. In 1986, the Supreme Court overturned the +precedent set by Swain's case, in [Batson v. Kentucky, 476 U.S. +79](https://supreme.justia.com/cases/federal/us/476/79). + +## A small disparity and a hypothetical world + +To answer the question that the Supreme Court asked, we return to the method we used in the last chapter. + +Let us imagine a hypothetical world, in which each individual black or white +person had an equal chance of being selected for the jury. Call this world Hypothetical County, Alabama. + +Just as in 1960's Talladega County, 26% of eligible jurors in Hypothetical +County are black. Hypothetical County jury selection has no bias against +black people, so we expect around 26% of the jury to be black. 0.26 * 12 = +3.12, so we expect that, on average, just over 3 out of 12 jurors in +a Hypothetical County jury will be black. But, if we select each juror at +random from the population, that means that, sometimes, by chance, we will have +fewer than 3 black jurors, and sometimes will have more than 3 black jurors. +And, by chance, sometimes we will have no black jurors. But, if the jurors +really are selected at random, how often would we expect this to happen — that +there are no black jurors? We would like to estimate the *probability* that +we will get no black jurors. If that probability is small, then we have some +evidence that the disparity in selection between black and white jurors, was +not "small". + +::: {.question} +What is the probability of an *all white* jury being randomly selected out of a population having 26% black people? +::: + +## Designing the experiment + +Before we start, we need to figure out three things: + +1. What do we mean by one trial? +2. What is the outcome of interest from the trial? +3. How do we simulate one trial? + +We then take **three steps** to calculate the desired probability: + +1. *Repeat* the simulated trial procedure N times. +2. *Count* M, the number of trials with an outcome that matches the outcome we + are interested in. +3. Calculate the *proportion*, M/N. + This is an estimate of the probability in question. + +For this problem, our task is made a little easier by the fact that our *trial* (in the resampling sense) is a simulated *trial* (in the legal sense). One trial requires 12 simulated jurors, each labeled by race (white or black). + +The outcome we are interested in is the number of black jurors. + +Now comes the harder part. How do we simulate one trial? + +### One trial + +One trial requires 12 jurors, and we are interested only in the race of each juror. +In Hypothetical County, where selection by race is entirely random, each juror +has a 26% chance of being black. + +We need a way of simulating a 26% chance. + +One way of doing this is by getting a random number from 0 through 99 +(inclusive). There are 100 numbers in the range 0 through 99 (inclusive). + +We will arbitrarily say that the juror is white if the random number is in the +range from 0 through 73. 74 of the 100 numbers are in this range, so +the juror has a 74/100 = 74% chance of getting the label "white". We will say +the juror is black if the random number is in the range 74 though 99. There +are 26 such numbers, so the juror has a 26% chance of getting the label +"black". + +Next we need a way of getting a random number in the range 0 through 99. This +is an easy job for the computer, but if we had to do this with a physical +device, we could get a single number by throwing *two* 10-sided dice, say a +blue die and a green die. The face of the blue die will be the 10s digit, and +the green face will be the ones digit. So, if the blue die comes up with 8 and +the green die has 4, then the random number is 84. + +We could then simulate 12 jurors by repeating this process 12 times, each time +writing down "white" if the number is from 0 through 74, and "black" +otherwise. The trial outcome is the number of times we wrote "black" for these +12 simulated jurors. + +### Using code to simulate a trial + +We use the same logic to simulate a trial with the computer. A little code +makes the job easier, because we can ask {{< var lang >}} to give us 12 random +numbers from 0 through 99, and to count how many of these numbers are in the +range from 75 through 99. Numbers in the range from 75 through 99 correspond +to black jurors. + +### Random numbers from 0 through 99 {#sec-random-zero-through-ninety-nine} + +We can now use {{< var np_or_r >}} and +[the random number functions]{.python}[`sample`]{.r} +from the last chapter to get 12 random numbers from 0 through 99. + +```{python} +# Import the Numpy library, rename as "np" +import numpy as np + +# Ask NumPy for a random number generator. +rnd = np.random.default_rng() + +# All the integers from 0 up to, but not including 100. +zero_thru_99 = np.arange(100) + +# Get 12 random numbers from 0 through 99 +a = rnd.choice(zero_thru_99, size=12) + +# Show the result +a +``` + +```{r} +# Get 12 random numbers from 0 through 99 +a <- sample(0:99, size=12, replace=TRUE) + +# Show the result +a +``` + +#### Counting the jurors + +We use *comparison* and [`np.sum`]{.python}[`sum`]{.r} to count how +many numbers are greater than 74, and therefore, in the range from 75 through +99: + +```{python} +# How many numbers are greater than 74? +b = np.sum(a > 74) +# Show the result +b +``` + +```{r} +# How many numbers are greater than 74? +b <- sum(a > 74) +# Show the result +b +``` + +#### A single simulated trial + +We assemble the pieces from the last few sections to make a {{< var cell >}} +that simulates a single trial: + +```{python} +rnd = np.random.default_rng() +zero_thru_99 = np.arange(100) + +# Get 12 random numbers from 0 through 99 +a = rnd.choice(zero_thru_99, size=12) + +# How many numbers are greater than 74? +b = np.sum(a > 74) + +# Show the result +b +``` + +```{r} +# Get 12 random numbers from 0 through 99 +a <- sample(0:99, size=12, replace=TRUE) +# How many are greater than 74? +b <- sum(a > 74) +# Show the result +b +``` + +## Three simulation steps + +Now we come back to the details of how we: + +1. Repeat the simulated trial many times; +2. record the results for each trial; +3. calculate the required proportion as an estimate of the probability we seek. + +Repeating the trial many times is the job of the `for` loop, and we will come +to that soon. + +In order to record the results, we will store each trial result in {{< var +an_array >}}. + +::::{.callout-note} +### More on {{< var array >}}s + +Since we will be working with {{< var array >}}s a lot, it is worth knowing +more about them. + +A [NumPy array]{.python}[vector]{.r} is a *container* that stores many +elements of the same type. You have already seen, in @sec-resampling-method, +how we can create {{< var an_array >}} from a sequence of +numbers using the [`np.array`]{.python}[`c()`]{.r} function. + +```{python} +# Make an array of numbers, store with the name "some_numbers". +some_numbers = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) +# Show the value of "some_numbers" +some_numbers +``` + +```{r} +# Make a vector of numbers, store with the name "some_numbers". +some_numbers <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) +# Show the value of "some_numbers" +some_numbers +``` + +Another way that we can create {{< var array >}}s is to use the +[`np.zeros`]{.python}[`numeric`]{.r} function to make a new array where all the +elements are 0. + +```{python} +# Make a new array containing 5 zeros. +# store with the name "z". +z = np.zeros(5) +# Show the value of "z" +z +``` + +```{r} +# Make a new vector containing 5 zeros. +z <- numeric(5) +# Show the value of "z" +z +``` + +Notice the argument `5` to the [`np.zeros`]{.python}[`numeric`]{.r} function. +This tells the function how many zeros we want in the {{< var array >}} that +the function will return. + +## {{< var array >}} length {#sec-array-length} + +The are various useful things we can do with this {{< var array >}} container. +One is to ask how many elements there are in the {{< var array >}} container. +We can use the [`len`]{.python}[`length`]{.r} function to calculate the number +of elements in {{< var an_array >}}: + +```{python} +# Show the number of elements in "z" +len(z) +``` + +```{r} +# Show the number of elements in "z" +length(z) +``` + +## Indexing into {{< var array >}}s {#sec-array-indexing} + +Another thing we can do is *set* the value for a particular element in the +{{< var array >}}. To do this, we use square brackets following the +{{< var array >}} value, on the left hand side of the equals sign, +like this: + +```{python} +# Set the value of the *first* element in the array. +z[0] = 99 +# Show the new contents of the array. +z +``` + +```{r} +# Set the value of the first element in the vector. +z[1] = 99 +# Show the new contents of the vector. +z +``` + +Read the first line of code as "the element at position +[0]{.python}[1]{.r} gets a value of 99". + +:::: python +Notice that the position number of the first element in the array is 0, and the +position number of the second element is 1. Think of the position as an +*offset* from the beginning of the array. The first element is at the +beginning of the array, and so it is at offset (position) 0. This can be a +little difficult to get used to at first, but you will find that thinking of +the positions of offsets in this way soon starts to come naturally, and later +you will also find that it helps you to avoid some common mistakes when using +positions for getting and setting values. +::: + +For practice, let us also set the value of the third element in the {{< var +array >}}: + +```{python} +# Set the value of the *third* element in the array. +z[2] = 99 +# Show the new contents of the array. +z +``` + +```{r} +# Set the value of the third element in the vector. +z[3] = 99 +# Show the new contents of the vector. +z +``` + +Read the first code line above as as "set the value at position +[2]{.python}[3]{.r} +in the {{< var array >}} to have the value 99". + +We can also *get* the value of the element at a given position, using the same +square-bracket notation: + +```{python} +# Get the value of the *first* element in the array. +# Store the value with name "v" +v = z[0] +# Show the value we got +v +``` + +```{r} +# Get the value of the *first* element in the array. +# Store the value with name "v" +v = z[1] +# Show the value we got +v +``` + +Read the first code line here as "v gets the value at position +[0]{.python}[1]{.r} +in the {{< var array >}}". + +Using square brackets to get and set element values is called *indexing* into +the {{< var array >}}. +:::: + +### Repeating trials + +As a preview, let us now imagine that we want to do 50 simulated trials of +Robert Swain's jury in Hypothetical County. We will want to store the count +for each trial, to give 50 counts. + +In order to do this, we make {{< var an_array >}} to hold the 50 counts. +Call this {{< var array >}} `z`. + +```{python} +# An array to hold the 50 count values. +z = np.zeros(50) +``` + +```{r} +# A vector to hold the 50 count values. +z <- numeric(50) +``` + +We could run a single trial to get a single simulated count. Here we just +repeat the code {{< var cell >}} you saw above. Notice that we can get a +different result each time we run this code, because the numbers in `a` are +*random* choices from the range 0 through 99, and different random numbers will +give different counts. + +```{python} +rnd = np.random.default_rng() +zero_thru_99 = np.arange(0, 100) +# Get 12 random numbers from 0 through 99 +a = rnd.choice(zero_thru_99, size=12) +# How many numbers are greater than 74? +b = np.sum(a > 74) +# Show the result +b +``` + +```{r} +# Get 12 random numbers from 0 through 99 +a <- sample(0:99, size=12, replace=TRUE) +# How many are greater than 74? +b <- sum(a == 9) +# Show the result +b +``` + +Now we have the result of a single trial, we can store it as the first number +in the `z` {{< var array >}}: + +```{python} +# Store the single trial count as the first value in the "z" array. +z[0] = b +# Show all the values in the "z" array. +z +``` + +```{r} +# Store the single trial count as the first value in the "z" vector. +z[1] <- b +# Show all the values in the "z" vector. +z +``` + +Of course we could just keep doing this: run the {{< var cell >}} corresponding +to a trial, above, to get a new count, and then store it at the next position +in the `z` {{< var array >}}. For example, we could store the counts for the +first three trials with: + +```{python} +# First trial +a = rnd.choice(zero_thru_99, size=12) +b = np.sum(a > 74) +# Store the result at the first position in z +# Remember, the first position is offset 0. +z[0] = b +# Second trial +a = rnd.choice(zero_thru_99, size=12) +b = np.sum(a > 74) +# Store the result at the second position in z +z[1] = b +# Third trial +a = rnd.choice(zero_thru_99, size=12) +b = np.sum(a > 74) +# Store the result at the third position in z +z[2] = b + +# And so on ... +``` + +```{r} +# First trial +a <- sample(0:99, size=12, replace=TRUE) +b <- sum(a == 9) +# Store the result at the first position in z +z[1] <- b + +# Second trial +a <- sample(0:99, size=12, replace=TRUE) +b <- sum(a == 9) +# Store the result at the second position in z +z[2] <- b + +# Third trial +a <- sample(0:99, size=12, replace=TRUE) +b <- sum(a == 9) +# Store the result at the third position in z +z[3] <- b + +# And so on ... +``` + +This would get terribly long and boring to type for 50 trials. Luckily +computer code is very good at repeating the same procedure many times. For +example, {{< var lang >}} can do this using a `for` loop. You have already seen +a preview of the `for` loop in @sec-resampling-method. Here we dive into `for` +loops in more depth. + +### For-loops in {{< var lang >}} {#sec-for-loops} + +A for-loop is a way of asking {{< var lang >}} to: + +* Take a sequence of things, one by one, and +* Do the same task on each one. + +We often use this idea when we are trying to explain a repeating procedure. +For example, imagine we wanted to explain what the supermarket checkout person +does for the items in your shopping basket. You might say that they do this: + +> For each item of shopping in your basket, they take the item off the conveyor +> belt, scan it, and put it on the other side of the till. + +You could also break this description up into bullet points with indentation, +to say the same thing: + +* For each item from your shopping basket, they: + * Take the item off the conveyor belt. + * Scan the item. + * Put it on the other side of the till. + +Notice the logic; the checkout person is repeating the same procedure for each +of a series of items. + +This is the logic of the `for` loop in {{< var lang >}}. The procedure that +{{< var lang >}} repeats is called the *body of the for loop*. In the example +of the checkout person above, the repeating procedure is: + + * Take the item off the conveyor belt. + * Scan the item. + * Put it on the other side of the till. + +Now imagine we wanted to use {{< var lang >}} to print out the year of birth +for each of the authors for the third edition of this book: + +| Author | Year of birth | +| ------ | ---- | +| Julian Lincoln Simon | 1932 | +| Matthew Brett | 1964 | +| Stéfan van der Walt | 1980 | +| Ian Nimmo-Smith | 1944 | + +We want to see this output: + +``` +Author birth year is 1932 +Author birth year is 1964 +Author birth year is 1980 +Author birth year is 1944 +``` + +Of course, we could just ask {{< var lang >}} to print out these exact lines, +like this: + +```{python} +print('Author birth year is 1932') +print('Author birth year is 1964') +print('Author birth year is 1980') +print('Author birth year is 1944') +``` + +```{r} +message('Author birth year is 1932') +message('Author birth year is 1964') +message('Author birth year is 1980') +message('Author birth year is 1944') +``` + +We might instead notice that we are repeating the same procedure for each of the +four birth years, and decide to do the same thing using a `for` loop: + +```{python} +author_birth_years = np.array([1932, 1964, 1980, 1944]) + +# For each birth year +for birth_year in author_birth_years: + # Repeat this procedure ... + print('Author birth year is', birth_year) +``` + +```{r} +author_birth_years <- c(1932, 1964, 1980, 1944) + +# For each birth year +for (birth_year in author_birth_years) { + # Repeat this procedure ... + message('Author birth year is ', birth_year) +} +``` + +The `for` loop starts with a line where we tell it what items we want to repeat +the procedure for: + +::: python + +``` +for birth_year in author_birth_years: +``` + +This *initial line* of the `for` loop ends with a colon. + +The next thing in the `for` loop is the procedure Python should follow for each +item. Python knows that the following lines are the procedure it should +repeat, because the lines are *indented*. The *indented* lines are the *body of +the for loop*. +::: + +::: r + +``` +for (birth_year in author_birth_years) { +``` + +This *initial line* of the `for` loop ends with an *opening curly brace* `{`. +The opening curly brace tells R that what follows, up until the matching +closing curly brace `}`, is the procedure R should follow for each item. The +lines between the opening `{` and closing `}` curly braces* are the *body of +the for loop*. +::: + +The initial line of the `for` loop above tells {{< var lang >}} that it should +take *each item* in `author_birth_years`, one by one — first 1932, then 1964, +then 1980, then 1944. For each of these numbers it will: + +* Put the number into the variable `birth_year`, then +* Run the [indented]{.python} code [between the curly braces]{.r}. + +Just as the person at the supermarket checkout takes each item in turn, for +each iteration (repeat) of the `for` loop, `birth_year` gets a new value from +the sequence in `author_birth_years`. `birth_year` is called the *loop +variable*, because it is the variable that gets a new value each time we begin +a new iteration of the `for` loop procedure. As for any variable in {{< var +lang >}}, we can call our loop variable anything we like. We used `birth_year` +here, but we could have used `y` or `year` or some other name. + +::: r + +Notice that R insists we put parentheses (round brackets) around: the loop +variable; `in`; and the sequence that will fill the loop variable — like this: + +``` +for (birth_year in author_birth_years) { +``` + +Do not forget these round brackets — R insists on them. +::: + +Now you know what the `for` loop is doing, you can see that the `for` loop +above is equivalent to the following code: + +```{python} +birth_year = 1932 # Set the loop variable to contain the first value. +print('Author birth year is', birth_year) # Use it. +birth_year = 1964 # Set the loop variable to contain the next value. +print('Author birth year is', birth_year) # Use the second value. +birth_year = 1980 +print('Author birth year is', birth_year) +birth_year = 1944 +print('Author birth year is', birth_year) +``` + +```{r} +birth_year <- 1932 # Set the loop variable to contain the first value. +message('Author birth year is ', birth_year) # Use the first value. +birth_year <- 1964 # Set the loop variable to contain the next value. +message('Author birth year is ', birth_year) # Use the second value. +birth_year <- 1980 +message('Author birth year is ', birth_year) +birth_year <- 1944 +message('Author birth year is ', birth_year) +``` + +Writing the steps in the `for` loop out like this is called *unrolling* the +loop. It can be a useful exercise to do this when you come across a `for` +loop, in order to work through the logic of the loop. For example, you may want +to write out the unrolled equivalent of the first couple of iterations, to see +what the loop variable will be, and what will happen in the body of the loop. + +We often use `for` loops with ranges (see @sec-ranges). Here we use a loop to +print out the numbers [0 through 3]{.python}[1 through 4]{.r}: + +```{python} +for n in np.arange(0, 4): + print('The loop variable n is', n) +``` + +```{r} +for (n in 1:4) { + message('The loop variable n is ', n) +} +``` + +Notice that the range ended at [(the number before)]{.python} 4, and that means +we repeat the loop body 4 times. We can also use the loop variable value from +the range as an *index*, to get or set the first, second, etc values from {{< +var an_array >}}. + +For example, maybe we would like to show the author position *and* the author +year of birth. + +Remember our author birth years: + +```{python} +author_birth_years +``` + +```{r} +author_birth_years +``` + +We can get (for example) the second author birth year with: + +```{python} +author_birth_years[1] +``` + +::: python +Remember, for Python, the first element is position 0, so the second element is +position 1. +::: + +```{r} +author_birth_years[2] +``` + +Using the combination of looping over a range, and {{< var array >}} indexing, +we can print out the author position *and* the author birth year: + +```{python} +for n in np.arange(0, 4): + year = author_birth_years[n] + print('Birth year of author position', n, 'is', year) +``` + +::: python +Again, remember Python considers 0 as the first position. +::: + +```{r} +for (n in 1:4) { + year <- author_birth_years[n] + message('Birth year of author position ', n, ' is ', year) +} +``` + +Just for practice, let us unroll the first two iterations through this `for` +loop, to remind ourselves what the code is doing: + +```{python} +# Unrolling the for loop. +n = 0 +year = author_birth_years[n] # Will be 1932 +print('Birth year of author position', n, 'is', year) +n = 1 +year = author_birth_years[n] # Will be 1964 +print('Birth year of author position', n, 'is', year) +# And so on. +``` + +```{r} +# Unrolling the for loop. +n <- 1 +year <- author_birth_years[n] # Will be 1932 +message('Birth year of author position ', n, ' is ', year) +n <- 2 +year <- author_birth_years[n] # Will be 1964 +message('Birth year of author position ', n, ' is ', year) +# And so on. +``` + +::: python + +### `range` in Python `for` loops {#sec-for-range} + +So far we have used `np.arange` to give us the sequence of integers that we +feed into the `for` loop. But — as you saw in @sec-python-range — we can also +get a range of numbers from Python's `range` function. `range` is a common +and useful alternative way to provide a range of numbers to a `for` loop. + +You have just seen how we would use `np.arange` to send the numbers 0, 1, 2, +and 3 to a `for` loop, in the example above, repeated here: + +```{python} +for n in np.arange(0, 4): + year = author_birth_years[n] + print('Birth year of author position', n, 'is', year) +``` + +We could also use `range` instead of `np.arange` to do the same task: + +```{python} +for n in range(0, 4): + year = author_birth_years[n] + print('Birth year of author position', n, 'is', year) +``` + +In fact, you will see this pattern throughout the book, where we use `for` +statements like `for value in range(10000):` to ask Python to put each number +in the range 0 up to (not including) 100000 into the variable `value`, and then +do something in the body of the loop. Just to be clear, we could always, and +almost as easily, write `for value in np.arange(10000):` to do the same task. +But — even though we could use `np.arange` to get an array of numbers, we +generally prefer `range` in our Python `for` loops, because it is just a little +less typing (without the `np.a` of `np.arange`, and because it is a more common +pattern in standard Python code.[^range-efficiency] + +[^range-efficiency]: Actually, there is a reason why many Python programmers + prefer `range` in their `for` loops to `np.arange`. `range` is a very + efficient container, in that it doesn't need to take up all the memory + required to create the full array, it just needs to keep track of the number + to give you next. For example, consider `for i in np.arange(10000000):` — in + this case Python has to make an array with 10,000,000 elements, and then, + from that array, it passes each value one by one to the `for` loop. On the + other hand, `for i in range(10000000):` will do the job just as well, + passing the same sequence of 0 through 9,999,999 to `i`, one by one, but + `range(10000000)` never has to make the whole 10,000,000 element array — it + just needs to keep track of which number to give up next. Therefore `range` + is very quick, and very efficient in memory. This doesn't have any great + practical impact for the arrays we are using here, typically of 10,0000 + elements or so, but it is worthwhile for larger arrays. + +::: + +### Putting it all together + +Here is the code we worked out above, to implement a single trial: + +```{python} +rnd = np.random.default_rng() +zero_thru_99 = np.arange(0, 100) +# Get 12 random numbers from 0 through 99 +a = rnd.choice(zero_thru_99, size=12) +# How many numbers are greater than 74? +b = np.sum(a > 74) +# Show the result +b +``` + +```{r} +# Get 12 random numbers from 0 through 99 +a <- sample(0:99, size=12, replace=TRUE) +# How many are greater than 74? +b <- sum(a == 9) +# Show the result +b +``` + +We found that we could use {{< var array >}}s to store the results of these +trials, and that we could use `for` loops to repeat the same procedure many +times. + +Now we can put these parts together to do 50 simulated trials: + +```{python} +# Procedure for 50 simulated trials. + +# The Numpy random number generator. +rnd = np.random.default_rng() + +# All the numbers from 0 through 99. +zero_through_99 = np.arange(0, 100) + +# An array to store the counts for each trial. +z = np.zeros(50) + +# Repeat the trial procedure 50 times. +for i in np.arange(0, 50): + # Get 12 random numbers from 0 through 99 + a = rnd.choice(zero_through_99, size=12) + # How many numbers are greater than 74? + b = np.sum(a > 74) + # Store the result at the next position in the "z" array. + z[i] = b + # Now go back and do the next trial until finished. +# Show the result of all 50 trials. +z +``` + +```{r} +# Procedure for 50 simulated trials. + +# A vector to store the counts for each trial. +z <- numeric(50) + +# Repeat the trial procedure 50 times. +for (i in 1:50) { + # Get 12 random numbers from 0 through 99 + a <- sample(0:99, size=12, replace=TRUE) + # How many are greater than 74? + b <- sum(a > 74) + # Store the result at the next position in the "z" vector. + z[i] = b + # Now go back and do the next trial until finished. +} +# Show the result of all 50 trials. +z +``` + +Finally, we need to count how many of the trials in `z` ended up with all-white +juries. These are the trials with a `z` (count) value of 0. + +To do this, we can ask {{< var an_array >}} which elements match a certain +condition. E.g.: + +```{python} +x = np.array([2, 1, 3, 0]) +y = x < 2 +# Show the result +y +``` + +```{r} +x <- c(2, 1, 3, 0) +y = x < 2 +# Show the result +y +``` + +We now use that same technique to ask, of *each of the 50 counts*, whether the +{{< var array >}} `z` is equal to 0, like this: + +```{python} +# Is the value of z equal to 0? +all_white = z == 0 +# Show the result of the comparison. +all_white +``` + +```{r} +# Is the value of z equal to 0? +all_white <- z == 0 +# Show the result of the comparison. +all_white +``` + +We need to get the number of {{< var true >}} values in `all_white`, to find +how many simulated trials gave all-white juries. + +```{python} +# Count the number of True values in "all_white" +# This is the same as the number of values in "z" that are equal to 0. +n_all_white = np.sum(all_white) +# Show the result of the comparison. +n_all_white +``` + +```{r} +# Count the number of True values in "all_white" +# This is the same as the number of values in "z" that are equal to 0. +n_all_white = sum(all_white) +# Show the result of the comparison. +n_all_white +``` + +`n_all_white` is the number of simulated trials for which all the jury members +were white. It only remains to get the proportion of trials for which this was +true, and to do this, we divide by the number of trials. + +```{python} +# Proportion of trials where all jury members were white. +p = n_all_white / 50 +# Show the result +p +``` + +```{r} +# Proportion of trials where all jury members were white. +p <- n_all_white / 50 +# Show the result +p +``` + +From this initial simulation, it seems there is around a +`r round(get_var('p') * 100, 1)`% chance that a jury selected randomly from +the population, which was 26% black, would have no black jurors. + +## Many many trials + +Our experiment above is only 50 simulated trials. The higher the number of +trials, the more confident we can be of our estimate for `p` — the proportion of trials where we get an all-white jury. + +It is no extra trouble for us to tell the computer to do a very large number +of trials. For example, we might want to run 10,000 trials instead of 50. +All we have to do is to run the loop 10,000 times instead of 50 times. The +computer has to do more work, but it is more than up to the job. + +Here is exactly the same code we ran above, but collected into one {{< var +cell >}}, and using 10,000 trials instead of 50. We have left out the +comments, to make the code more compact. + +```{python} +# Full simulation procedure, with 10,000 trials. +rnd = np.random.default_rng() +zero_through_99 = np.arange(0, 100) +# 10,000 trials. +z = np.zeros(10000) +for i in np.arange(0, 10000): + a = rnd.choice(zero_through_99, size=12) + b = np.sum(a > 74) + z[i] = b +all_white = z == 0 +n_all_white = sum(all_white) +p = n_all_white / 10000 +p +``` + +```{r} +# Full simulation procedure, with 10,000 trials. +z <- numeric(10000) +for (i in 1:10000) { + a <- sample(0:99, size=12, replace=TRUE) + b <- sum(a > 74) + z[i] = b +} +all_white <- z == 0 +n_all_white <- sum(all_white) +p <- n_all_white / 10000 +p +``` + +We now have a new, more accurate estimate of the proportion of Hypothetical County juries with all-white juries. The proportion is +`r round(get_var('p'), 3)`, and so +`r round(get_var('p') * 100, 1)`%. + +This proportion means that, for any one jury from Hypothetical County, there +is a less than one in 20 chance that the jury would be all white. + +As we will see in more detail later, we might consider using the results from +this experiment in Hypothetical County, to reflect on the result we saw in the +real Talladega County. We might conclude, for example, that there was likely +some systematic difference between Hypothetical County and Talledega County. +Maybe the difference was that there was, in fact, some bias in the jury +selection in Talledega county, and that the Supreme Court was wrong to reject +this. You will hear more of this line of reasoning later in the book. + +## Conclusion + +In this chapter we studied a real life-and-death question, on racial bias and +the death penalty. We continued our exploration of the ways we can use +probability, and resampling, to draw conclusions about real events. Along the +way, we went into more detail on {{< var array >}}s in {{< var lang >}}, and +`for` loops; two basic tools in resampling. + +In the next chapter, we will work through some more problems in probability, +to show how we can use resampling, to answer questions about chance. We will +add some more tools for writing code in {{< var lang >}}, to make your +programs easier to write, read, and understand.