diff --git a/pandas.ipynb b/pandas.ipynb index f0a8a33..c20528c 100644 --- a/pandas.ipynb +++ b/pandas.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -153,15 +153,169 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
525.294.71MaleNoSunDinner4
68.772.00MaleNoSunDinner2
726.883.12MaleNoSunDinner4
815.041.96MaleNoSunDinner2
914.783.23MaleNoSunDinner2
\n", + "
" + ], + "text/plain": [ + " total_bill tip sex smoker day time size\n", + "0 16.99 1.01 Female No Sun Dinner 2\n", + "1 10.34 1.66 Male No Sun Dinner 3\n", + "2 21.01 3.50 Male No Sun Dinner 3\n", + "3 23.68 3.31 Male No Sun Dinner 2\n", + "4 24.59 3.61 Female No Sun Dinner 4\n", + "5 25.29 4.71 Male No Sun Dinner 4\n", + "6 8.77 2.00 Male No Sun Dinner 2\n", + "7 26.88 3.12 Male No Sun Dinner 4\n", + "8 15.04 1.96 Male No Sun Dinner 2\n", + "9 14.78 3.23 Male No Sun Dinner 2" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", "# load a dataframe from disk\n", "tips = pd.read_csv(\"tips.csv\")\n", - "tips.head(5)" + "tips.head(10)" ] }, { @@ -180,6 +334,25 @@ "\n" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + } + ], + "source": [ + "print(type(tips.total_bill))\n", + "print(type(tips))" + ] + }, { "cell_type": "markdown", "metadata": { @@ -203,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -213,9 +386,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Accessing a certain value via the index\n", "\n", @@ -224,21 +408,78 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 1. 15. -5. nan 4. 123. 0. 78. 0. 5. -4.]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 1.01, 1.66, 3.5 , 3.31, 3.61, 4.71, 2. , 3.12, 1.96,\n", + " 3.23, 1.71, 5. , 1.57, 3. 
, 3.02, 3.92, 1.67, 3.71,\n", + " 3.5 , 3.35, 4.08, 2.75, 2.23, 7.58, 3.18, 2.34, 2. ,\n", + " 2. , 4.3 , 3. , 1.45, 2.5 , 3. , 2.45, 3.27, 3.6 ,\n", + " 2. , 3.07, 2.31, 5. , 2.24, 2.54, 3.06, 1.32, 5.6 ,\n", + " 3. , 5. , 6. , 2.05, 3. , 2.5 , 2.6 , 5.2 , 1.56,\n", + " 4.34, 3.51, 3. , 1.5 , 1.76, 6.73, 3.21, 2. , 1.98,\n", + " 3.76, 2.64, 3.15, 2.47, 1. , 2.01, 2.09, 1.97, 3. ,\n", + " 3.14, 5. , 2.2 , 1.25, 3.08, 4. , 3. , 2.71, 3. ,\n", + " 3.4 , 1.83, 5. , 2.03, 5.17, 2. , 4. , 5.85, 3. ,\n", + " 3. , 3.5 , 1. , 4.3 , 3.25, 4.73, 4. , 1.5 , 3. ,\n", + " 1.5 , 2.5 , 3. , 2.5 , 3.48, 4.08, 1.64, 4.06, 4.29,\n", + " 3.76, 4. , 3. , 1. , 4. , 2.55, 4. , 3.5 , 5.07,\n", + " 1.5 , 1.8 , 2.92, 2.31, 1.68, 2.5 , 2. , 2.52, 4.2 ,\n", + " 1.48, 2. , 2. , 2.18, 1.5 , 2.83, 1.5 , 2. , 3.25,\n", + " 1.25, 2. , 2. , 2. , 2.75, 3.5 , 6.7 , 5. , 5. ,\n", + " 2.3 , 1.5 , 1.36, 1.63, 1.73, 2. , 2.5 , 2. , 2.74,\n", + " 2. , 2. , 5.14, 5. , 3.75, 2.61, 2. , 3.5 , 2.5 ,\n", + " 2. , 2. , 3. , 3.48, 2.24, 4.5 , 1.61, 2. , 10. ,\n", + " 3.16, 5.15, 3.18, 4. , 3.11, 2. , 2. , 4. , 3.55,\n", + " 3.68, 5.65, 3.5 , 6.5 , 3. , 5. , 3.5 , 2. , 3.5 ,\n", + " 4. , 1.5 , 4.19, 2.56, 2.02, 4. , 1.44, 2. , 5. ,\n", + " 2. , 2. , 4. , 2.01, 2. , 2.5 , 4. , 3.23, 3.41,\n", + " 3. , 2.03, 2.23, 2. , 5.16, 9. , 2.5 , 6.5 , 1.1 ,\n", + " 3. , 1.5 , 1.44, 3.09, 2.2 , 3.48, 1.92, 3. , 1.58,\n", + " 2.5 , 2. , 3. , 2.72, 2.88, 2. , 3. , 3.39, 1.47,\n", + " 3. , 1.25, 1. , 1.17, 4.67, 5.92, 2. , 2. , 1.75,\n", + " 3. ])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Note that there are a bunch of attributes.\n", "# .values returns a numpy ndarray of the values! 
\n", "\n", - "my_series.values" + "print(my_series.values)\n", + "tips.tip.values" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=11, step=1)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Take a look at the index. What type is it? \n", "# You convert itto a numpy ndarray by adding \".values\" again!\n", @@ -248,9 +489,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "om 1.0\n", + "ir 15.0\n", + "os -5.0\n", + "pap NaN\n", + "pas 4.0\n", + "pil 123.0\n", + "io 0.0\n", + "po 78.0\n", + "ulos 0.0\n", + "is 5.0\n", + "best -4.0\n", + "dtype: float64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# You can overwrite the index directly: \n", "\n", @@ -276,9 +539,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(1.0, 1.0)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Series that have string indices can also be accessed via a RangeIndex\n", "# (which is similar to the index of a regular Python list)\n", @@ -288,9 +562,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(1.0, -5.0)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Note that indices can get moved around, by sorting for example!\n", "# iloc gives you the element you would get if the Series\n", @@ -321,7 +606,7 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 18, "metadata": { "slideshow": { "slide_type": "slide" @@ -364,7 +649,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "slideshow": { "slide_type": "slide" @@ -414,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -426,9 +711,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "om False\n", + "ir False\n", + "os False\n", + "pap False\n", + "pas False\n", + "pil False\n", + "io True\n", + "po False\n", + "ulos True\n", + "is False\n", + "best False\n", + "dtype: bool" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# getting a boolean-valued series by checking a condition\n", "\n", @@ -438,14 +745,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "io 0.0\n", + "ulos 0.0\n", + "dtype: float64\n", + "io 0.0\n", + "ulos 0.0\n", + "dtype: float64\n" + ] + } + ], "source": [ "# Notice the index of x is a SUBSET of the index of \"my_series\"\n", "# This can be useful when needing to relate values back to the original \"my_series\"!\n", "\n", - "x = my_series[choose]" + "x = my_series[choose]\n", + "print(x)\n", + "y = my_series[my_series == 0]\n", + "print(y)" ] }, { @@ -468,16 +791,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "om 1.0\n", + "ir 15.0\n", + "os -5.0\n", + "pas 4.0\n", + "pil 123.0\n", + "po 78.0\n", + "is 5.0\n", + "best -4.0\n", + "dtype: float64\n" + ] + } + ], "source": [ "# Challenge: \n", "\n", "# Filter \"my_series\" to be all the 
elements that are NOT\n", "# equal to 0, using the \"choose\" boolean mask below: \n", "\n", - "choose = my_series == 0.0\n" + "choose = my_series == 0.0\n", + "not_zero = my_series[(~choose) & (~np.isnan(my_series))]\n", + "print(not_zero)\n" ] }, { @@ -543,9 +884,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 foo\n", + "1 bar\n", + "3 baz\n", + "4 qux\n", + "dtype: object" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "# Get a list of names, without the Null values!\n", @@ -554,7 +910,8 @@ "# 1. Create a boolean mask by using the .notna() method.\n", "# 2. Use the mask to subset the Series.\n", "\n", - "names = pd.Series(['foo','bar',None,'baz','qux',None])\n" + "names = pd.Series(['foo','bar',None,'baz','qux',None])\n", + "names[names.notna()]\n" ] }, { @@ -583,9 +940,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 foo\n", + "1 bar\n", + "2 None\n", + "3 foo\n", + "4 None\n", + "5 bar\n", + "6 bar\n", + "7 foo\n", + "8 None\n", + "dtype: object" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "\n", @@ -598,23 +975,41 @@ " # Your code here\n", " # HINT: delete the \"pass\" when your done\n", " # HINT2: handle None values!\n", - " pass\n", - "\n", + " try: \n", + " return s.lower()\n", + " except AttributeError:\n", + " return None\n", + " \n", "\n", - "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])" + "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])\n", + "names.map(lower)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "metadata": {}, - "outputs": [], + "outputs": [ + 
{ + "data": { + "text/plain": [ + "bar 3\n", + "foo 3\n", + "dtype: int64" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "\n", "# Using the series from above, now lowercased, count the occurences of each name\n", "# Hint: It's simple, just use .value_counts()!\n", - "\n" + "lowered = names.map(lower)\n", + "lowered.value_counts()\n" ] }, { @@ -636,19 +1031,185 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
525.294.71MaleNoSunDinner4
68.772.00MaleNoSunDinner2
726.883.12MaleNoSunDinner4
815.041.96MaleNoSunDinner2
914.783.23MaleNoSunDinner2
\n", + "
" + ], + "text/plain": [ + " total_bill tip sex smoker day time size\n", + "0 16.99 1.01 Female No Sun Dinner 2\n", + "1 10.34 1.66 Male No Sun Dinner 3\n", + "2 21.01 3.50 Male No Sun Dinner 3\n", + "3 23.68 3.31 Male No Sun Dinner 2\n", + "4 24.59 3.61 Female No Sun Dinner 4\n", + "5 25.29 4.71 Male No Sun Dinner 4\n", + "6 8.77 2.00 Male No Sun Dinner 2\n", + "7 26.88 3.12 Male No Sun Dinner 4\n", + "8 15.04 1.96 Male No Sun Dinner 2\n", + "9 14.78 3.23 Male No Sun Dinner 2" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "import pandas as pd\n", "tips = pd.read_csv(\"tips.csv\")\n", "tips.head(10) # the first method of our dataframe object! " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# the other important attribute: name of rows and columns\n", "tips.index\n", @@ -679,6 +1240,26 @@ "```" ] }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.48929877523035775" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips['size'].corr(tips.tip)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -733,21 +1314,133 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sexsmoker
1MaleNo
3MaleNo
\n", + "
" + ], + "text/plain": [ + " sex smoker\n", + "1 Male No\n", + "3 Male No" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Accessing rows AND columns!\n", "# Example of 2-dimension loc\n", "\n", + "\n", "tips.loc[[1,3], ['sex', 'smoker']]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sexsmokerdaytimesize
1MaleNoSunDinner3
3MaleNoSunDinner2
\n", + "
" + ], + "text/plain": [ + " sex smoker day time size\n", + "1 Male No Sun Dinner 3\n", + "3 Male No Sun Dinner 2" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Accessing rows AND columns!\n", "# Example of 2-dimensional iloc\n", @@ -757,15 +1450,232 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 83, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tipsexday
204.08MaleSat
212.75FemaleSat
222.23FemaleSat
237.58MaleSat
243.18MaleSat
252.34MaleSat
262.00MaleSat
272.00MaleSat
284.30MaleSat
293.00FemaleSat
301.45MaleSat
312.50MaleSat
323.00FemaleSat
332.45FemaleSat
343.27MaleSat
353.60MaleSat
362.00MaleSat
373.07FemaleSat
382.31MaleSat
395.00MaleSat
402.24MaleSat
412.54MaleSun
423.06MaleSun
431.32MaleSun
445.60MaleSun
\n", + "
" + ], + "text/plain": [ + " tip sex day\n", + "20 4.08 Male Sat\n", + "21 2.75 Female Sat\n", + "22 2.23 Female Sat\n", + "23 7.58 Male Sat\n", + "24 3.18 Male Sat\n", + "25 2.34 Male Sat\n", + "26 2.00 Male Sat\n", + "27 2.00 Male Sat\n", + "28 4.30 Male Sat\n", + "29 3.00 Female Sat\n", + "30 1.45 Male Sat\n", + "31 2.50 Male Sat\n", + "32 3.00 Female Sat\n", + "33 2.45 Female Sat\n", + "34 3.27 Male Sat\n", + "35 3.60 Male Sat\n", + "36 2.00 Male Sat\n", + "37 3.07 Female Sat\n", + "38 2.31 Male Sat\n", + "39 5.00 Male Sat\n", + "40 2.24 Male Sat\n", + "41 2.54 Male Sun\n", + "42 3.06 Male Sun\n", + "43 1.32 Male Sun\n", + "44 5.60 Male Sun" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge:\n", "\n", "# Using the tips dataframe, create a new one that contains the \n", "# information contained in all rows between the 20th (inclusive) \n", - "# and the 45th (exclusive) and only the columns: tip, sex, day" + "# and the 45th (exclusive) and only the columns: tip, sex, day\n", + "\n", + "tips.loc[20:44, ['tip', 'sex', 'day']]" ] }, { @@ -834,9 +1744,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 100, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.40463164280330477\n" + ] + }, + { + "data": { + "text/plain": [ + "0.40463164280330477" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge:\n", "\n", @@ -844,7 +1772,43 @@ "# tip and size for only Male clients during Dinner. \n", "\n", "# HINT: Remember that \"size\" cannot be accessed via dot notation, as it's an \n", - "# attribute of the series!" 
+ "# attribute of the series!\n", + "\n", + "print(tips[(tips.sex == 'Male') & (tips.time == 'Dinner')].tip.corr(tips[(tips.sex == 'Male') & (tips.time == 'Dinner')]['size']))\n", + "\n", + "male_dinner = tips[(tips.sex == 'Male') & (tips.time == 'Dinner')]\n", + "male_dinner.tip.corr(male_dinner['size'])" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Dinner\n", + "1 Dinner\n", + "2 Dinner\n", + "3 Dinner\n", + "4 Dinner\n", + " ... \n", + "239 Dinner\n", + "240 Dinner\n", + "241 Dinner\n", + "242 Dinner\n", + "243 Dinner\n", + "Name: time, Length: 244, dtype: object" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.time" ] }, { @@ -864,21 +1828,219 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 104, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtip
size
17.2425001.437500
216.4480132.582308
323.2776323.393158
428.6135144.135405
530.0680004.028000
634.8300005.225000
\n", + "
" + ], + "text/plain": [ + " total_bill tip\n", + "size \n", + "1 7.242500 1.437500\n", + "2 16.448013 2.582308\n", + "3 23.277632 3.393158\n", + "4 28.613514 4.135405\n", + "5 30.068000 4.028000\n", + "6 34.830000 5.225000" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Group tips dataframe by size of table\n", "by_size = tips.groupby(\"size\")\n", "\n", - "by_size" + "by_size.mean()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 105, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[(1, total_bill tip sex smoker day time size\n", + " 67 3.07 1.00 Female Yes Sat Dinner 1\n", + " 82 10.07 1.83 Female No Thur Lunch 1\n", + " 111 7.25 1.00 Female No Sat Dinner 1\n", + " 222 8.58 1.92 Male Yes Fri Lunch 1),\n", + " (2, total_bill tip sex smoker day time size\n", + " 0 16.99 1.01 Female No Sun Dinner 2\n", + " 3 23.68 3.31 Male No Sun Dinner 2\n", + " 6 8.77 2.00 Male No Sun Dinner 2\n", + " 8 15.04 1.96 Male No Sun Dinner 2\n", + " 9 14.78 3.23 Male No Sun Dinner 2\n", + " .. ... ... ... ... ... ... 
...\n", + " 237 32.83 1.17 Male Yes Sat Dinner 2\n", + " 240 27.18 2.00 Female Yes Sat Dinner 2\n", + " 241 22.67 2.00 Male Yes Sat Dinner 2\n", + " 242 17.82 1.75 Male No Sat Dinner 2\n", + " 243 18.78 3.00 Female No Thur Dinner 2\n", + " \n", + " [156 rows x 7 columns]),\n", + " (3, total_bill tip sex smoker day time size\n", + " 1 10.34 1.66 Male No Sun Dinner 3\n", + " 2 21.01 3.50 Male No Sun Dinner 3\n", + " 16 10.33 1.67 Female No Sun Dinner 3\n", + " 17 16.29 3.71 Male No Sun Dinner 3\n", + " 18 16.97 3.50 Female No Sun Dinner 3\n", + " 19 20.65 3.35 Male No Sat Dinner 3\n", + " 35 24.06 3.60 Male No Sat Dinner 3\n", + " 36 16.31 2.00 Male No Sat Dinner 3\n", + " 37 16.93 3.07 Female No Sat Dinner 3\n", + " 38 18.69 2.31 Male No Sat Dinner 3\n", + " 39 31.27 5.00 Male No Sat Dinner 3\n", + " 40 16.04 2.24 Male No Sat Dinner 3\n", + " 48 28.55 2.05 Male No Sun Dinner 3\n", + " 64 17.59 2.64 Male No Sat Dinner 3\n", + " 65 20.08 3.15 Male No Sat Dinner 3\n", + " 71 17.07 3.00 Female No Sat Dinner 3\n", + " 102 44.30 2.50 Female Yes Sat Dinner 3\n", + " 112 38.07 4.00 Male No Sun Dinner 3\n", + " 114 25.71 4.00 Female No Sun Dinner 3\n", + " 129 22.82 2.18 Male No Thur Lunch 3\n", + " 146 18.64 1.36 Female No Thur Lunch 3\n", + " 152 17.26 2.74 Male No Sun Dinner 3\n", + " 162 16.21 2.00 Female No Sun Dinner 3\n", + " 165 24.52 3.48 Male No Sun Dinner 3\n", + " 170 50.81 10.00 Male Yes Sat Dinner 3\n", + " 182 45.35 3.50 Male Yes Sun Dinner 3\n", + " 186 20.90 3.50 Female Yes Sun Dinner 3\n", + " 188 18.15 3.50 Female Yes Sun Dinner 3\n", + " 189 23.10 4.00 Male Yes Sun Dinner 3\n", + " 200 18.71 4.00 Male Yes Thur Lunch 3\n", + " 205 16.47 3.23 Female Yes Thur Lunch 3\n", + " 206 26.59 3.41 Male Yes Sat Dinner 3\n", + " 210 30.06 2.00 Male Yes Sat Dinner 3\n", + " 214 28.17 6.50 Female Yes Sat Dinner 3\n", + " 223 15.98 3.00 Female No Fri Lunch 3\n", + " 231 15.69 3.00 Male Yes Sat Dinner 3\n", + " 238 35.83 4.67 Female No Sat Dinner 3\n", + " 239 29.03 5.92 
Male No Sat Dinner 3),\n", + " (4, total_bill tip sex smoker day time size\n", + " 4 24.59 3.61 Female No Sun Dinner 4\n", + " 5 25.29 4.71 Male No Sun Dinner 4\n", + " 7 26.88 3.12 Male No Sun Dinner 4\n", + " 11 35.26 5.00 Female No Sun Dinner 4\n", + " 13 18.43 3.00 Male No Sun Dinner 4\n", + " 23 39.42 7.58 Male No Sat Dinner 4\n", + " 25 17.81 2.34 Male No Sat Dinner 4\n", + " 31 18.35 2.50 Male No Sat Dinner 4\n", + " 33 20.69 2.45 Female No Sat Dinner 4\n", + " 44 30.40 5.60 Male No Sun Dinner 4\n", + " 47 32.40 6.00 Male No Sun Dinner 4\n", + " 52 34.81 5.20 Female No Sun Dinner 4\n", + " 54 25.56 4.34 Male No Sun Dinner 4\n", + " 56 38.01 3.00 Male Yes Sat Dinner 4\n", + " 59 48.27 6.73 Male No Sat Dinner 4\n", + " 63 18.29 3.76 Male Yes Sat Dinner 4\n", + " 77 27.20 4.00 Male No Thur Lunch 4\n", + " 85 34.83 5.17 Female No Thur Lunch 4\n", + " 95 40.17 4.73 Male Yes Fri Dinner 4\n", + " 116 29.93 5.07 Male No Sun Dinner 4\n", + " 119 24.08 2.92 Female No Thur Lunch 4\n", + " 153 24.55 2.00 Male No Sun Dinner 4\n", + " 154 19.77 2.00 Male No Sun Dinner 4\n", + " 157 25.00 3.75 Female No Sun Dinner 4\n", + " 159 16.49 2.00 Male No Sun Dinner 4\n", + " 160 21.50 3.50 Male No Sun Dinner 4\n", + " 167 31.71 4.50 Male No Sun Dinner 4\n", + " 180 34.65 3.68 Male Yes Sun Dinner 4\n", + " 183 23.17 6.50 Male Yes Sun Dinner 4\n", + " 197 43.11 5.00 Female Yes Thur Lunch 4\n", + " 204 20.53 4.00 Male Yes Thur Lunch 4\n", + " 207 38.73 3.00 Male Yes Sat Dinner 4\n", + " 211 25.89 5.16 Male Yes Sat Dinner 4\n", + " 212 48.33 9.00 Male No Sat Dinner 4\n", + " 219 30.14 3.09 Female Yes Sat Dinner 4\n", + " 227 20.45 3.00 Male No Sat Dinner 4\n", + " 230 24.01 2.00 Male Yes Sat Dinner 4),\n", + " (5, total_bill tip sex smoker day time size\n", + " 142 41.19 5.00 Male No Thur Lunch 5\n", + " 155 29.85 5.14 Female No Sun Dinner 5\n", + " 185 20.69 5.00 Male No Sun Dinner 5\n", + " 187 30.46 2.00 Male Yes Sun Dinner 5\n", + " 216 28.15 3.00 Male Yes Sat Dinner 5),\n", + " 
(6, total_bill tip sex smoker day time size\n", + " 125 29.80 4.2 Female No Thur Lunch 6\n", + " 141 34.30 6.7 Male No Thur Lunch 6\n", + " 143 27.05 5.0 Female No Thur Lunch 6\n", + " 156 48.17 5.0 Male No Sun Dinner 6)]" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# If we coerce it to a list, we see something interesting: \n", "# It's basically a list of tuples! \n", @@ -890,9 +2052,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 106, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Female\n", + "total_bill 18.056897\n", + "tip 2.833448\n", + "size 2.459770\n", + "dtype: float64\n", + "Male\n", + "total_bill 20.744076\n", + "tip 3.089618\n", + "size 2.630573\n", + "dtype: float64\n" + ] + } + ], "source": [ "# We can iterate through the groupby just like we would a list of tuples!\n", "\n", @@ -921,28 +2100,83 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 113, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "sex\n", + "Female 44.30\n", + "Male 50.81\n", + "Name: total_bill, dtype: float64" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get the maximum bill by gender: \n", "\n", "def max_bill(df):\n", " return df.total_bill.max()\n", "\n", - "tips.groupby(\"sex\").apply(max_bill)" + "tips.groupby(\"sex\").apply(max_bill)\n", + "\n", + "tips.groupby(\"sex\").total_bill.max()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 121, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "sex\n", + "Female 43.11\n", + "Male 48.33\n", + "dtype: float64" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "\n", "# Get the second largest bill by gender!\n", 
- "# HINT: use sort_values and iloc!" + "# HINT: use sort_values and iloc!\n", + "\n", + "def snd(df):\n", + " return df.sort_values('total_bill').iloc[-2,0] # df holds only this group's rows; column 0 is total_bill\n", + "\n", + "tips.groupby('sex').apply(snd)" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50.81" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.sort_values('total_bill').iloc[-1,0]" ] }, { @@ -961,9 +2195,74 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dayFriSatSunThur
sex
Female2.7811112.8017863.3672222.575625
Male2.6930003.0838983.2203452.980333
\n", + "
" + ], + "text/plain": [ + "day Fri Sat Sun Thur\n", + "sex \n", + "Female 2.781111 2.801786 3.367222 2.575625\n", + "Male 2.693000 3.083898 3.220345 2.980333" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "# What is the mean tip, per day, for male vs. female?\n", @@ -972,7 +2271,7 @@ "def day_mean(df):\n", " # Hint: you will need to group by \"day\"\n", " # in this function, then get the mean tip. \n", - " pass\n", + " return df.groupby('day').tip.mean()\n", "\n", "\n", "tips.groupby(\"sex\").apply(day_mean)" @@ -998,9 +2297,271 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[(('Female', 'Fri'), total_bill tip sex smoker day time size\n", + " 92 5.75 1.00 Female Yes Fri Dinner 2\n", + " 93 16.32 4.30 Female Yes Fri Dinner 2\n", + " 94 22.75 3.25 Female No Fri Dinner 2\n", + " 100 11.35 2.50 Female Yes Fri Dinner 2\n", + " 101 15.38 3.00 Female Yes Fri Dinner 2\n", + " 221 13.42 3.48 Female Yes Fri Lunch 2\n", + " 223 15.98 3.00 Female No Fri Lunch 3\n", + " 225 16.27 2.50 Female Yes Fri Lunch 2\n", + " 226 10.09 2.00 Female Yes Fri Lunch 2),\n", + " (('Female', 'Sat'), total_bill tip sex smoker day time size\n", + " 21 20.29 2.75 Female No Sat Dinner 2\n", + " 22 15.77 2.23 Female No Sat Dinner 2\n", + " 29 19.65 3.00 Female No Sat Dinner 2\n", + " 32 15.06 3.00 Female No Sat Dinner 2\n", + " 33 20.69 2.45 Female No Sat Dinner 4\n", + " 37 16.93 3.07 Female No Sat Dinner 3\n", + " 57 26.41 1.50 Female No Sat Dinner 2\n", + " 66 16.45 2.47 Female No Sat Dinner 2\n", + " 67 3.07 1.00 Female Yes Sat Dinner 1\n", + " 71 17.07 3.00 Female No Sat Dinner 3\n", + " 72 26.86 3.14 Female Yes Sat Dinner 2\n", + " 73 25.28 5.00 Female Yes Sat Dinner 2\n", + " 74 14.73 2.20 Female No Sat Dinner 2\n", + " 102 44.30 2.50 Female Yes Sat Dinner 3\n", + " 103 22.42 3.48 Female Yes Sat 
Dinner 2\n", + " 104 20.92 4.08 Female No Sat Dinner 2\n", + " 109 14.31 4.00 Female Yes Sat Dinner 2\n", + " 111 7.25 1.00 Female No Sat Dinner 1\n", + " 168 10.59 1.61 Female Yes Sat Dinner 2\n", + " 169 10.63 2.00 Female Yes Sat Dinner 2\n", + " 209 12.76 2.23 Female Yes Sat Dinner 2\n", + " 213 13.27 2.50 Female Yes Sat Dinner 2\n", + " 214 28.17 6.50 Female Yes Sat Dinner 3\n", + " 215 12.90 1.10 Female Yes Sat Dinner 2\n", + " 219 30.14 3.09 Female Yes Sat Dinner 4\n", + " 229 22.12 2.88 Female Yes Sat Dinner 2\n", + " 238 35.83 4.67 Female No Sat Dinner 3\n", + " 240 27.18 2.00 Female Yes Sat Dinner 2),\n", + " (('Female', 'Sun'), total_bill tip sex smoker day time size\n", + " 0 16.99 1.01 Female No Sun Dinner 2\n", + " 4 24.59 3.61 Female No Sun Dinner 4\n", + " 11 35.26 5.00 Female No Sun Dinner 4\n", + " 14 14.83 3.02 Female No Sun Dinner 2\n", + " 16 10.33 1.67 Female No Sun Dinner 3\n", + " 18 16.97 3.50 Female No Sun Dinner 3\n", + " 51 10.29 2.60 Female No Sun Dinner 2\n", + " 52 34.81 5.20 Female No Sun Dinner 4\n", + " 114 25.71 4.00 Female No Sun Dinner 3\n", + " 115 17.31 3.50 Female No Sun Dinner 2\n", + " 155 29.85 5.14 Female No Sun Dinner 5\n", + " 157 25.00 3.75 Female No Sun Dinner 4\n", + " 158 13.39 2.61 Female No Sun Dinner 2\n", + " 162 16.21 2.00 Female No Sun Dinner 3\n", + " 164 17.51 3.00 Female Yes Sun Dinner 2\n", + " 178 9.60 4.00 Female Yes Sun Dinner 2\n", + " 186 20.90 3.50 Female Yes Sun Dinner 3\n", + " 188 18.15 3.50 Female Yes Sun Dinner 3),\n", + " (('Female', 'Thur'), total_bill tip sex smoker day time size\n", + " 82 10.07 1.83 Female No Thur Lunch 1\n", + " 85 34.83 5.17 Female No Thur Lunch 4\n", + " 117 10.65 1.50 Female No Thur Lunch 2\n", + " 118 12.43 1.80 Female No Thur Lunch 2\n", + " 119 24.08 2.92 Female No Thur Lunch 4\n", + " 121 13.42 1.68 Female No Thur Lunch 2\n", + " 124 12.48 2.52 Female No Thur Lunch 2\n", + " 125 29.80 4.20 Female No Thur Lunch 6\n", + " 127 14.52 2.00 Female No Thur Lunch 2\n", + " 
128 11.38 2.00 Female No Thur Lunch 2\n", + " 131 20.27 2.83 Female No Thur Lunch 2\n", + " 132 11.17 1.50 Female No Thur Lunch 2\n", + " 133 12.26 2.00 Female No Thur Lunch 2\n", + " 134 18.26 3.25 Female No Thur Lunch 2\n", + " 135 8.51 1.25 Female No Thur Lunch 2\n", + " 136 10.33 2.00 Female No Thur Lunch 2\n", + " 137 14.15 2.00 Female No Thur Lunch 2\n", + " 139 13.16 2.75 Female No Thur Lunch 2\n", + " 140 17.47 3.50 Female No Thur Lunch 2\n", + " 143 27.05 5.00 Female No Thur Lunch 6\n", + " 144 16.43 2.30 Female No Thur Lunch 2\n", + " 145 8.35 1.50 Female No Thur Lunch 2\n", + " 146 18.64 1.36 Female No Thur Lunch 3\n", + " 147 11.87 1.63 Female No Thur Lunch 2\n", + " 191 19.81 4.19 Female Yes Thur Lunch 2\n", + " 197 43.11 5.00 Female Yes Thur Lunch 4\n", + " 198 13.00 2.00 Female Yes Thur Lunch 2\n", + " 201 12.74 2.01 Female Yes Thur Lunch 2\n", + " 202 13.00 2.00 Female Yes Thur Lunch 2\n", + " 203 16.40 2.50 Female Yes Thur Lunch 2\n", + " 205 16.47 3.23 Female Yes Thur Lunch 3\n", + " 243 18.78 3.00 Female No Thur Dinner 2),\n", + " (('Male', 'Fri'), total_bill tip sex smoker day time size\n", + " 90 28.97 3.00 Male Yes Fri Dinner 2\n", + " 91 22.49 3.50 Male No Fri Dinner 2\n", + " 95 40.17 4.73 Male Yes Fri Dinner 4\n", + " 96 27.28 4.00 Male Yes Fri Dinner 2\n", + " 97 12.03 1.50 Male Yes Fri Dinner 2\n", + " 98 21.01 3.00 Male Yes Fri Dinner 2\n", + " 99 12.46 1.50 Male No Fri Dinner 2\n", + " 220 12.16 2.20 Male Yes Fri Lunch 2\n", + " 222 8.58 1.92 Male Yes Fri Lunch 1\n", + " 224 13.42 1.58 Male Yes Fri Lunch 2),\n", + " (('Male', 'Sat'), total_bill tip sex smoker day time size\n", + " 19 20.65 3.35 Male No Sat Dinner 3\n", + " 20 17.92 4.08 Male No Sat Dinner 2\n", + " 23 39.42 7.58 Male No Sat Dinner 4\n", + " 24 19.82 3.18 Male No Sat Dinner 2\n", + " 25 17.81 2.34 Male No Sat Dinner 4\n", + " 26 13.37 2.00 Male No Sat Dinner 2\n", + " 27 12.69 2.00 Male No Sat Dinner 2\n", + " 28 21.70 4.30 Male No Sat Dinner 2\n", + " 30 9.55 1.45 Male 
No Sat Dinner 2\n", + " 31 18.35 2.50 Male No Sat Dinner 4\n", + " 34 17.78 3.27 Male No Sat Dinner 2\n", + " 35 24.06 3.60 Male No Sat Dinner 3\n", + " 36 16.31 2.00 Male No Sat Dinner 3\n", + " 38 18.69 2.31 Male No Sat Dinner 3\n", + " 39 31.27 5.00 Male No Sat Dinner 3\n", + " 40 16.04 2.24 Male No Sat Dinner 3\n", + " 56 38.01 3.00 Male Yes Sat Dinner 4\n", + " 58 11.24 1.76 Male Yes Sat Dinner 2\n", + " 59 48.27 6.73 Male No Sat Dinner 4\n", + " 60 20.29 3.21 Male Yes Sat Dinner 2\n", + " 61 13.81 2.00 Male Yes Sat Dinner 2\n", + " 62 11.02 1.98 Male Yes Sat Dinner 2\n", + " 63 18.29 3.76 Male Yes Sat Dinner 4\n", + " 64 17.59 2.64 Male No Sat Dinner 3\n", + " 65 20.08 3.15 Male No Sat Dinner 3\n", + " 68 20.23 2.01 Male No Sat Dinner 2\n", + " 69 15.01 2.09 Male Yes Sat Dinner 2\n", + " 70 12.02 1.97 Male No Sat Dinner 2\n", + " 75 10.51 1.25 Male No Sat Dinner 2\n", + " 76 17.92 3.08 Male Yes Sat Dinner 2\n", + " 105 15.36 1.64 Male Yes Sat Dinner 2\n", + " 106 20.49 4.06 Male Yes Sat Dinner 2\n", + " 107 25.21 4.29 Male Yes Sat Dinner 2\n", + " 108 18.24 3.76 Male No Sat Dinner 2\n", + " 110 14.00 3.00 Male No Sat Dinner 2\n", + " 170 50.81 10.00 Male Yes Sat Dinner 3\n", + " 171 15.81 3.16 Male Yes Sat Dinner 2\n", + " 206 26.59 3.41 Male Yes Sat Dinner 3\n", + " 207 38.73 3.00 Male Yes Sat Dinner 4\n", + " 208 24.27 2.03 Male Yes Sat Dinner 2\n", + " 210 30.06 2.00 Male Yes Sat Dinner 3\n", + " 211 25.89 5.16 Male Yes Sat Dinner 4\n", + " 212 48.33 9.00 Male No Sat Dinner 4\n", + " 216 28.15 3.00 Male Yes Sat Dinner 5\n", + " 217 11.59 1.50 Male Yes Sat Dinner 2\n", + " 218 7.74 1.44 Male Yes Sat Dinner 2\n", + " 227 20.45 3.00 Male No Sat Dinner 4\n", + " 228 13.28 2.72 Male No Sat Dinner 2\n", + " 230 24.01 2.00 Male Yes Sat Dinner 4\n", + " 231 15.69 3.00 Male Yes Sat Dinner 3\n", + " 232 11.61 3.39 Male No Sat Dinner 2\n", + " 233 10.77 1.47 Male No Sat Dinner 2\n", + " 234 15.53 3.00 Male Yes Sat Dinner 2\n", + " 235 10.07 1.25 Male No Sat Dinner 
2\n", + " 236 12.60 1.00 Male Yes Sat Dinner 2\n", + " 237 32.83 1.17 Male Yes Sat Dinner 2\n", + " 239 29.03 5.92 Male No Sat Dinner 3\n", + " 241 22.67 2.00 Male Yes Sat Dinner 2\n", + " 242 17.82 1.75 Male No Sat Dinner 2),\n", + " (('Male', 'Sun'), total_bill tip sex smoker day time size\n", + " 1 10.34 1.66 Male No Sun Dinner 3\n", + " 2 21.01 3.50 Male No Sun Dinner 3\n", + " 3 23.68 3.31 Male No Sun Dinner 2\n", + " 5 25.29 4.71 Male No Sun Dinner 4\n", + " 6 8.77 2.00 Male No Sun Dinner 2\n", + " 7 26.88 3.12 Male No Sun Dinner 4\n", + " 8 15.04 1.96 Male No Sun Dinner 2\n", + " 9 14.78 3.23 Male No Sun Dinner 2\n", + " 10 10.27 1.71 Male No Sun Dinner 2\n", + " 12 15.42 1.57 Male No Sun Dinner 2\n", + " 13 18.43 3.00 Male No Sun Dinner 4\n", + " 15 21.58 3.92 Male No Sun Dinner 2\n", + " 17 16.29 3.71 Male No Sun Dinner 3\n", + " 41 17.46 2.54 Male No Sun Dinner 2\n", + " 42 13.94 3.06 Male No Sun Dinner 2\n", + " 43 9.68 1.32 Male No Sun Dinner 2\n", + " 44 30.40 5.60 Male No Sun Dinner 4\n", + " 45 18.29 3.00 Male No Sun Dinner 2\n", + " 46 22.23 5.00 Male No Sun Dinner 2\n", + " 47 32.40 6.00 Male No Sun Dinner 4\n", + " 48 28.55 2.05 Male No Sun Dinner 3\n", + " 49 18.04 3.00 Male No Sun Dinner 2\n", + " 50 12.54 2.50 Male No Sun Dinner 2\n", + " 53 9.94 1.56 Male No Sun Dinner 2\n", + " 54 25.56 4.34 Male No Sun Dinner 4\n", + " 55 19.49 3.51 Male No Sun Dinner 2\n", + " 112 38.07 4.00 Male No Sun Dinner 3\n", + " 113 23.95 2.55 Male No Sun Dinner 2\n", + " 116 29.93 5.07 Male No Sun Dinner 4\n", + " 150 14.07 2.50 Male No Sun Dinner 2\n", + " 151 13.13 2.00 Male No Sun Dinner 2\n", + " 152 17.26 2.74 Male No Sun Dinner 3\n", + " 153 24.55 2.00 Male No Sun Dinner 4\n", + " 154 19.77 2.00 Male No Sun Dinner 4\n", + " 156 48.17 5.00 Male No Sun Dinner 6\n", + " 159 16.49 2.00 Male No Sun Dinner 4\n", + " 160 21.50 3.50 Male No Sun Dinner 4\n", + " 161 12.66 2.50 Male No Sun Dinner 2\n", + " 163 13.81 2.00 Male No Sun Dinner 2\n", + " 165 24.52 3.48 Male 
No Sun Dinner 3\n", + " 166 20.76 2.24 Male No Sun Dinner 2\n", + " 167 31.71 4.50 Male No Sun Dinner 4\n", + " 172 7.25 5.15 Male Yes Sun Dinner 2\n", + " 173 31.85 3.18 Male Yes Sun Dinner 2\n", + " 174 16.82 4.00 Male Yes Sun Dinner 2\n", + " 175 32.90 3.11 Male Yes Sun Dinner 2\n", + " 176 17.89 2.00 Male Yes Sun Dinner 2\n", + " 177 14.48 2.00 Male Yes Sun Dinner 2\n", + " 179 34.63 3.55 Male Yes Sun Dinner 2\n", + " 180 34.65 3.68 Male Yes Sun Dinner 4\n", + " 181 23.33 5.65 Male Yes Sun Dinner 2\n", + " 182 45.35 3.50 Male Yes Sun Dinner 3\n", + " 183 23.17 6.50 Male Yes Sun Dinner 4\n", + " 184 40.55 3.00 Male Yes Sun Dinner 2\n", + " 185 20.69 5.00 Male No Sun Dinner 5\n", + " 187 30.46 2.00 Male Yes Sun Dinner 5\n", + " 189 23.10 4.00 Male Yes Sun Dinner 3\n", + " 190 15.69 1.50 Male Yes Sun Dinner 2),\n", + " (('Male', 'Thur'), total_bill tip sex smoker day time size\n", + " 77 27.20 4.00 Male No Thur Lunch 4\n", + " 78 22.76 3.00 Male No Thur Lunch 2\n", + " 79 17.29 2.71 Male No Thur Lunch 2\n", + " 80 19.44 3.00 Male Yes Thur Lunch 2\n", + " 81 16.66 3.40 Male No Thur Lunch 2\n", + " 83 32.68 5.00 Male Yes Thur Lunch 2\n", + " 84 15.98 2.03 Male No Thur Lunch 2\n", + " 86 13.03 2.00 Male No Thur Lunch 2\n", + " 87 18.28 4.00 Male No Thur Lunch 2\n", + " 88 24.71 5.85 Male No Thur Lunch 2\n", + " 89 21.16 3.00 Male No Thur Lunch 2\n", + " 120 11.69 2.31 Male No Thur Lunch 2\n", + " 122 14.26 2.50 Male No Thur Lunch 2\n", + " 123 15.95 2.00 Male No Thur Lunch 2\n", + " 126 8.52 1.48 Male No Thur Lunch 2\n", + " 129 22.82 2.18 Male No Thur Lunch 3\n", + " 130 19.08 1.50 Male No Thur Lunch 2\n", + " 138 16.00 2.00 Male Yes Thur Lunch 2\n", + " 141 34.30 6.70 Male No Thur Lunch 6\n", + " 142 41.19 5.00 Male No Thur Lunch 5\n", + " 148 9.78 1.73 Male No Thur Lunch 2\n", + " 149 7.51 2.00 Male No Thur Lunch 2\n", + " 192 28.44 2.56 Male Yes Thur Lunch 2\n", + " 193 15.48 2.02 Male Yes Thur Lunch 2\n", + " 194 16.58 4.00 Male Yes Thur Lunch 2\n", + " 195 7.56 
1.44 Male No Thur Lunch 2\n", + " 196 10.34 2.00 Male Yes Thur Lunch 2\n", + " 199 13.51 2.00 Male Yes Thur Lunch 2\n", + " 200 18.71 4.00 Male Yes Thur Lunch 3\n", + " 204 20.53 4.00 Male Yes Thur Lunch 4)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Take a look at the structure of the multiple groupby!\n", "\n", @@ -1133,9 +2694,70 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
screennameid_strtexthashtags
0nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...[worldrocked, jawdrop, ml]
1om98214039I eat linear models for breakfast #datascience...[datascience, ml, crossfit]
\n", + "
" + ], + "text/plain": [ + " screenname id_str text \\\n", + "0 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n", + "1 om 98214039 I eat linear models for breakfast #datascience... \n", + "\n", + " hashtags \n", + "0 [worldrocked, jawdrop, ml] \n", + "1 [datascience, ml, crossfit] " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "raw_tweets = [{ \"screenname\": \"nandanrao\",\n", " \"id_str\": \"928374987\",\n", @@ -1165,9 +2787,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
screennameid_strtext
0nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...
1om98214039I eat linear models for breakfast #datascience...
\n", + "
" + ], + "text/plain": [ + " screenname id_str text\n", + "0 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd...\n", + "1 om 98214039 I eat linear models for breakfast #datascience..." + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tweets = pd.DataFrame(raw_tweets, columns = [\"screenname\", \"id_str\", \"text\"])\n", "tweets" @@ -1175,9 +2851,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_strhashtag
0928374987worldrocked
1928374987jawdrop
2928374987ml
398214039datascience
498214039ml
598214039crossfit
\n", + "
" + ], + "text/plain": [ + " id_str hashtag\n", + "0 928374987 worldrocked\n", + "1 928374987 jawdrop\n", + "2 928374987 ml\n", + "3 98214039 datascience\n", + "4 98214039 ml\n", + "5 98214039 crossfit" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tags_and_ids = [(t['id_str'], tag) \n", " for t in raw_tweets \n", @@ -1190,11 +2941,108 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
screennameid_strtexthashtag
0nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...worldrocked
1nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...jawdrop
2nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...ml
3om98214039I eat linear models for breakfast #datascience...datascience
4om98214039I eat linear models for breakfast #datascience...ml
5om98214039I eat linear models for breakfast #datascience...crossfit
\n", + "
" + ], + "text/plain": [ + " screenname id_str text \\\n", + "0 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n", + "1 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n", + "2 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n", + "3 om 98214039 I eat linear models for breakfast #datascience... \n", + "4 om 98214039 I eat linear models for breakfast #datascience... \n", + "5 om 98214039 I eat linear models for breakfast #datascience... \n", + "\n", + " hashtag \n", + "0 worldrocked \n", + "1 jawdrop \n", + "2 ml \n", + "3 datascience \n", + "4 ml \n", + "5 crossfit " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df = tweets.merge(hashtags, how='left')\n", + "df = tweets.merge(hashtags, how='inner')\n", "\n", "df" ] @@ -1218,6 +3066,678 @@ "\n", "*Needless to say that eyeballing is OK for making sure your code makes sense, but will not result in full credits for the project. We want a fully automated code. To carry out the project successfully you need to use most the attributes and methods described earlier. The last one is a little tricky*" ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#Import modules\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "#Open CSV files\n", + "df_p = pd.read_csv('supermarket_prices.csv')\n", + "df_t = pd.read_csv('supermarket_transactions.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ProductPrice
0tomato2.1
1potato3.4
2apple1.2
3orange4.3
4banana5.2
\n", + "
" + ], + "text/plain": [ + " Product Price\n", + "0 tomato 2.1\n", + "1 potato 3.4\n", + "2 apple 1.2\n", + "3 orange 4.3\n", + "4 banana 5.2" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BuyerProductQuantity
0Jacksonapple4
1Jacksonapple9
2Johnorange9
3Johnpotato10
4Tomtomato4
............
94Sophiaapple7
95Jacksonpotato8
96Liampotato2
97Sophiapotato6
98Johnorange10
\n", + "

99 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Buyer Product Quantity\n", + "0 Jackson apple 4\n", + "1 Jackson apple 9\n", + "2 John orange 9\n", + "3 John potato 10\n", + "4 Tom tomato 4\n", + ".. ... ... ...\n", + "94 Sophia apple 7\n", + "95 Jackson potato 8\n", + "96 Liam potato 2\n", + "97 Sophia potato 6\n", + "98 John orange 10\n", + "\n", + "[99 rows x 3 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(df_p)\n", + "display(df_t)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer\n", + "Emma 81\n", + "Jackson 70\n", + "John 122\n", + "Liam 81\n", + "Lucas 62\n", + "Sandra 78\n", + "Sophia 61\n", + "Tom 49\n", + "Name: Quantity, dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#How many items each client has purchased\n", + "df_t.groupby('Buyer').Quantity.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer Product\n", + "Emma apple 4\n", + " banana 4\n", + " potato 2\n", + " tomato 2\n", + "Jackson orange 5\n", + " apple 3\n", + " tomato 3\n", + " potato 1\n", + "John orange 7\n", + " banana 4\n", + " tomato 3\n", + " potato 2\n", + " apple 1\n", + "Liam banana 4\n", + " apple 3\n", + " potato 3\n", + " orange 2\n", + " tomato 1\n", + "Lucas orange 3\n", + " tomato 3\n", + " apple 2\n", + " potato 2\n", + " banana 1\n", + "Sandra orange 5\n", + " potato 4\n", + " banana 1\n", + " tomato 1\n", + "Sophia apple 3\n", + " banana 3\n", + " orange 2\n", + " potato 2\n", + " tomato 2\n", + "Tom apple 4\n", + " potato 3\n", + " tomato 3\n", + " banana 1\n", + "Name: Product, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#How many items of each type each client has purchased\n", + 
"df_t.groupby('Buyer').Product.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BuyerProductQuantityPrice
0Jacksonapple41.2
1Jacksonapple91.2
2Johnorange94.3
3Johnpotato103.4
4Tomtomato42.1
...............
94Sophiaapple71.2
95Jacksonpotato83.4
96Liampotato23.4
97Sophiapotato63.4
98Johnorange104.3
\n", + "

99 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " Buyer Product Quantity Price\n", + "0 Jackson apple 4 1.2\n", + "1 Jackson apple 9 1.2\n", + "2 John orange 9 4.3\n", + "3 John potato 10 3.4\n", + "4 Tom tomato 4 2.1\n", + ".. ... ... ... ...\n", + "94 Sophia apple 7 1.2\n", + "95 Jackson potato 8 3.4\n", + "96 Liam potato 2 3.4\n", + "97 Sophia potato 6 3.4\n", + "98 John orange 10 4.3\n", + "\n", + "[99 rows x 4 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Calculate the total amount spent by each client\n", + "\n", + "#To do this I need to start by joining the two datasets I have \n", + "\n", + "df = df_t.merge(df_p, how = 'left' ,on = 'Product')\n", + "display(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BuyerProductQuantityPriceSpending
0Jacksonapple41.24.8
1Jacksonapple91.210.8
2Johnorange94.338.7
3Johnpotato103.434.0
4Tomtomato42.18.4
..................
94Sophiaapple71.28.4
95Jacksonpotato83.427.2
96Liampotato23.46.8
97Sophiapotato63.420.4
98Johnorange104.343.0
\n", + "

99 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " Buyer Product Quantity Price Spending\n", + "0 Jackson apple 4 1.2 4.8\n", + "1 Jackson apple 9 1.2 10.8\n", + "2 John orange 9 4.3 38.7\n", + "3 John potato 10 3.4 34.0\n", + "4 Tom tomato 4 2.1 8.4\n", + ".. ... ... ... ... ...\n", + "94 Sophia apple 7 1.2 8.4\n", + "95 Jackson potato 8 3.4 27.2\n", + "96 Liam potato 2 3.4 6.8\n", + "97 Sophia potato 6 3.4 20.4\n", + "98 John orange 10 4.3 43.0\n", + "\n", + "[99 rows x 5 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Calculate the total amount spent by each client\n", + "df['Spending'] = df.Quantity*df.Price\n", + "display(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer\n", + "Emma 246.4\n", + "Jackson 202.8\n", + "John 461.3\n", + "Liam 263.3\n", + "Lucas 176.0\n", + "Sandra 300.8\n", + "Sophia 189.4\n", + "Tom 126.1\n", + "Name: Spending, dtype: float64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Calculate the total amount spent by each client\n", + "df.groupby('Buyer').Spending.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer\n", + "Emma 135.2\n", + "John 145.6\n", + "Liam 83.2\n", + "Lucas 15.6\n", + "Sandra 10.4\n", + "Sophia 67.6\n", + "Tom 31.2\n", + "Name: Spending, dtype: float64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Part 1:\n", + "# The company that provides the supermarket with bananas wishes to give a prize to the client that \n", + "# has spent the largest proportion of their spending on bananas. 
Who should win the prize?\n", + "\n", + "df[df.Product == 'banana'].groupby('Buyer').Spending.sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Part 2:\n", + "# A marketing company that works with the supermarket is interested to understand better \n", + "# the characteristics of the three people that have spent most of their spending on bananas. \n", + "# For each one of them report the other product that they have spent most of their remaining income on" + ] } ], "metadata": { @@ -1237,7 +3757,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.3" + "version": "3.7.3" } }, "nbformat": 4,