diff --git a/pandas.ipynb b/pandas.ipynb
index f0a8a33..c20528c 100644
--- a/pandas.ipynb
+++ b/pandas.ipynb
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -32,7 +32,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -153,15 +153,169 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " total_bill | \n",
+ " tip | \n",
+ " sex | \n",
+ " smoker | \n",
+ " day | \n",
+ " time | \n",
+ " size | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 16.99 | \n",
+ " 1.01 | \n",
+ " Female | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 10.34 | \n",
+ " 1.66 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 21.01 | \n",
+ " 3.50 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 23.68 | \n",
+ " 3.31 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 24.59 | \n",
+ " 3.61 | \n",
+ " Female | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 25.29 | \n",
+ " 4.71 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 8.77 | \n",
+ " 2.00 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 26.88 | \n",
+ " 3.12 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 15.04 | \n",
+ " 1.96 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 14.78 | \n",
+ " 3.23 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " total_bill tip sex smoker day time size\n",
+ "0 16.99 1.01 Female No Sun Dinner 2\n",
+ "1 10.34 1.66 Male No Sun Dinner 3\n",
+ "2 21.01 3.50 Male No Sun Dinner 3\n",
+ "3 23.68 3.31 Male No Sun Dinner 2\n",
+ "4 24.59 3.61 Female No Sun Dinner 4\n",
+ "5 25.29 4.71 Male No Sun Dinner 4\n",
+ "6 8.77 2.00 Male No Sun Dinner 2\n",
+ "7 26.88 3.12 Male No Sun Dinner 4\n",
+ "8 15.04 1.96 Male No Sun Dinner 2\n",
+ "9 14.78 3.23 Male No Sun Dinner 2"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"import pandas as pd\n",
"\n",
"# load a dataframe from disk\n",
"tips = pd.read_csv(\"tips.csv\")\n",
- "tips.head(5)"
+ "tips.head(10)"
]
},
{
@@ -180,6 +334,25 @@
"\n"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(type(tips.total_bill))\n",
+ "print(type(tips))"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
@@ -203,7 +376,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -213,9 +386,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Accessing a certain value via the index\n",
"\n",
@@ -224,21 +408,78 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[ 1. 15. -5. nan 4. 123. 0. 78. 0. 5. -4.]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([ 1.01, 1.66, 3.5 , 3.31, 3.61, 4.71, 2. , 3.12, 1.96,\n",
+ " 3.23, 1.71, 5. , 1.57, 3. , 3.02, 3.92, 1.67, 3.71,\n",
+ " 3.5 , 3.35, 4.08, 2.75, 2.23, 7.58, 3.18, 2.34, 2. ,\n",
+ " 2. , 4.3 , 3. , 1.45, 2.5 , 3. , 2.45, 3.27, 3.6 ,\n",
+ " 2. , 3.07, 2.31, 5. , 2.24, 2.54, 3.06, 1.32, 5.6 ,\n",
+ " 3. , 5. , 6. , 2.05, 3. , 2.5 , 2.6 , 5.2 , 1.56,\n",
+ " 4.34, 3.51, 3. , 1.5 , 1.76, 6.73, 3.21, 2. , 1.98,\n",
+ " 3.76, 2.64, 3.15, 2.47, 1. , 2.01, 2.09, 1.97, 3. ,\n",
+ " 3.14, 5. , 2.2 , 1.25, 3.08, 4. , 3. , 2.71, 3. ,\n",
+ " 3.4 , 1.83, 5. , 2.03, 5.17, 2. , 4. , 5.85, 3. ,\n",
+ " 3. , 3.5 , 1. , 4.3 , 3.25, 4.73, 4. , 1.5 , 3. ,\n",
+ " 1.5 , 2.5 , 3. , 2.5 , 3.48, 4.08, 1.64, 4.06, 4.29,\n",
+ " 3.76, 4. , 3. , 1. , 4. , 2.55, 4. , 3.5 , 5.07,\n",
+ " 1.5 , 1.8 , 2.92, 2.31, 1.68, 2.5 , 2. , 2.52, 4.2 ,\n",
+ " 1.48, 2. , 2. , 2.18, 1.5 , 2.83, 1.5 , 2. , 3.25,\n",
+ " 1.25, 2. , 2. , 2. , 2.75, 3.5 , 6.7 , 5. , 5. ,\n",
+ " 2.3 , 1.5 , 1.36, 1.63, 1.73, 2. , 2.5 , 2. , 2.74,\n",
+ " 2. , 2. , 5.14, 5. , 3.75, 2.61, 2. , 3.5 , 2.5 ,\n",
+ " 2. , 2. , 3. , 3.48, 2.24, 4.5 , 1.61, 2. , 10. ,\n",
+ " 3.16, 5.15, 3.18, 4. , 3.11, 2. , 2. , 4. , 3.55,\n",
+ " 3.68, 5.65, 3.5 , 6.5 , 3. , 5. , 3.5 , 2. , 3.5 ,\n",
+ " 4. , 1.5 , 4.19, 2.56, 2.02, 4. , 1.44, 2. , 5. ,\n",
+ " 2. , 2. , 4. , 2.01, 2. , 2.5 , 4. , 3.23, 3.41,\n",
+ " 3. , 2.03, 2.23, 2. , 5.16, 9. , 2.5 , 6.5 , 1.1 ,\n",
+ " 3. , 1.5 , 1.44, 3.09, 2.2 , 3.48, 1.92, 3. , 1.58,\n",
+ " 2.5 , 2. , 3. , 2.72, 2.88, 2. , 3. , 3.39, 1.47,\n",
+ " 3. , 1.25, 1. , 1.17, 4.67, 5.92, 2. , 2. , 1.75,\n",
+ " 3. ])"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Note that there are a bunch of attributes.\n",
"# .values returns a numpy ndarray of the values! \n",
"\n",
- "my_series.values"
+ "print(my_series.values)\n",
+ "tips.tip.values"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "RangeIndex(start=0, stop=11, step=1)"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Take a look at the index. What type is it? \n",
"# You convert itto a numpy ndarray by adding \".values\" again!\n",
@@ -248,9 +489,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "om 1.0\n",
+ "ir 15.0\n",
+ "os -5.0\n",
+ "pap NaN\n",
+ "pas 4.0\n",
+ "pil 123.0\n",
+ "io 0.0\n",
+ "po 78.0\n",
+ "ulos 0.0\n",
+ "is 5.0\n",
+ "best -4.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# You can overwrite the index directly: \n",
"\n",
@@ -276,9 +539,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1.0, 1.0)"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Series that have string indices can also be accessed via a RangeIndex\n",
"# (which is similar to the index of a regular Python list)\n",
@@ -288,9 +562,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1.0, -5.0)"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Note that indices can get moved around, by sorting for example!\n",
"# iloc gives you the element you would get if the Series\n",
@@ -321,7 +606,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -364,7 +649,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -414,7 +699,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -426,9 +711,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "om False\n",
+ "ir False\n",
+ "os False\n",
+ "pap False\n",
+ "pas False\n",
+ "pil False\n",
+ "io True\n",
+ "po False\n",
+ "ulos True\n",
+ "is False\n",
+ "best False\n",
+ "dtype: bool"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# getting a boolean-valued series by checking a condition\n",
"\n",
@@ -438,14 +745,30 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "io 0.0\n",
+ "ulos 0.0\n",
+ "dtype: float64\n",
+ "io 0.0\n",
+ "ulos 0.0\n",
+ "dtype: float64\n"
+ ]
+ }
+ ],
"source": [
"# Notice the index of x is a SUBSET of the index of \"my_series\"\n",
"# This can be useful when needing to relate values back to the original \"my_series\"!\n",
"\n",
- "x = my_series[choose]"
+ "x = my_series[choose]\n",
+ "print(x)\n",
+ "y = my_series[my_series == 0]\n",
+ "print(y)"
]
},
{
@@ -468,16 +791,34 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 43,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "om 1.0\n",
+ "ir 15.0\n",
+ "os -5.0\n",
+ "pas 4.0\n",
+ "pil 123.0\n",
+ "po 78.0\n",
+ "is 5.0\n",
+ "best -4.0\n",
+ "dtype: float64\n"
+ ]
+ }
+ ],
"source": [
"# Challenge: \n",
"\n",
"# Filter \"my_series\" to be all the elements that are NOT\n",
"# equal to 0, using the \"choose\" boolean mask below: \n",
"\n",
- "choose = my_series == 0.0\n"
+ "choose = my_series == 0.0\n",
+ "not_zero = my_series[~choose & my_series.notna()]  # non-zero AND non-missing; NaN != 0 is True, so NaN must be dropped explicitly\n",
+ "print(not_zero)\n"
]
},
{
@@ -543,9 +884,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 48,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 foo\n",
+ "1 bar\n",
+ "3 baz\n",
+ "4 qux\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Challenge: \n",
"# Get a list of names, without the Null values!\n",
@@ -554,7 +910,8 @@
"# 1. Create a boolean mask by using the .notna() method.\n",
"# 2. Use the mask to subset the Series.\n",
"\n",
- "names = pd.Series(['foo','bar',None,'baz','qux',None])\n"
+ "names = pd.Series(['foo','bar',None,'baz','qux',None])\n",
+ "names[names.notna()]\n"
]
},
{
@@ -583,9 +940,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 67,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 foo\n",
+ "1 bar\n",
+ "2 None\n",
+ "3 foo\n",
+ "4 None\n",
+ "5 bar\n",
+ "6 bar\n",
+ "7 foo\n",
+ "8 None\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Challenge: \n",
"\n",
@@ -598,23 +975,41 @@
" # Your code here\n",
" # HINT: delete the \"pass\" when your done\n",
" # HINT2: handle None values!\n",
- " pass\n",
- "\n",
+ " try: \n",
+ " return s.lower()\n",
+ " except AttributeError:\n",
+ " return None\n",
+ " \n",
"\n",
- "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])"
+ "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])\n",
+ "names.map(lower)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 69,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "bar 3\n",
+ "foo 3\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Challenge: \n",
"\n",
"# Using the series from above, now lowercased, count the occurences of each name\n",
"# Hint: It's simple, just use .value_counts()!\n",
- "\n"
+ "lowered = names.map(lower)\n",
+ "lowered.value_counts()\n"
]
},
{
@@ -636,19 +1031,185 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " total_bill | \n",
+ " tip | \n",
+ " sex | \n",
+ " smoker | \n",
+ " day | \n",
+ " time | \n",
+ " size | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 16.99 | \n",
+ " 1.01 | \n",
+ " Female | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 10.34 | \n",
+ " 1.66 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 21.01 | \n",
+ " 3.50 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 23.68 | \n",
+ " 3.31 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 24.59 | \n",
+ " 3.61 | \n",
+ " Female | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 25.29 | \n",
+ " 4.71 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 8.77 | \n",
+ " 2.00 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 26.88 | \n",
+ " 3.12 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 15.04 | \n",
+ " 1.96 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 14.78 | \n",
+ " 3.23 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " total_bill tip sex smoker day time size\n",
+ "0 16.99 1.01 Female No Sun Dinner 2\n",
+ "1 10.34 1.66 Male No Sun Dinner 3\n",
+ "2 21.01 3.50 Male No Sun Dinner 3\n",
+ "3 23.68 3.31 Male No Sun Dinner 2\n",
+ "4 24.59 3.61 Female No Sun Dinner 4\n",
+ "5 25.29 4.71 Male No Sun Dinner 4\n",
+ "6 8.77 2.00 Male No Sun Dinner 2\n",
+ "7 26.88 3.12 Male No Sun Dinner 4\n",
+ "8 15.04 1.96 Male No Sun Dinner 2\n",
+ "9 14.78 3.23 Male No Sun Dinner 2"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
+ "import pandas as pd\n",
"tips = pd.read_csv(\"tips.csv\")\n",
"tips.head(10) # the first method of our dataframe object! "
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 73,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# the other important attribute: name of rows and columns\n",
"tips.index\n",
@@ -679,6 +1240,26 @@
"```"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.48929877523035775"
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tips['size'].corr(tips.tip)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
@@ -733,21 +1314,133 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 78,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sex | \n",
+ " smoker | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " Male | \n",
+ " No | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Male | \n",
+ " No | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sex smoker\n",
+ "1 Male No\n",
+ "3 Male No"
+ ]
+ },
+ "execution_count": 78,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Accessing rows AND columns!\n",
"# Example of 2-dimension loc\n",
"\n",
+ "\n",
"tips.loc[[1,3], ['sex', 'smoker']]"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 81,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sex | \n",
+ " smoker | \n",
+ " day | \n",
+ " time | \n",
+ " size | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Male | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sex smoker day time size\n",
+ "1 Male No Sun Dinner 3\n",
+ "3 Male No Sun Dinner 2"
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Accessing rows AND columns!\n",
"# Example of 2-dimensional iloc\n",
@@ -757,15 +1450,232 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 83,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tip | \n",
+ " sex | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 20 | \n",
+ " 4.08 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " 2.75 | \n",
+ " Female | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " 2.23 | \n",
+ " Female | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 7.58 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " 3.18 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " 2.34 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " 2.00 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " 2.00 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " 4.30 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " 3.00 | \n",
+ " Female | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " 1.45 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " 2.50 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " 3.00 | \n",
+ " Female | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " 2.45 | \n",
+ " Female | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 34 | \n",
+ " 3.27 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 35 | \n",
+ " 3.60 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " 2.00 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " 3.07 | \n",
+ " Female | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 38 | \n",
+ " 2.31 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 39 | \n",
+ " 5.00 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " 2.24 | \n",
+ " Male | \n",
+ " Sat | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " 2.54 | \n",
+ " Male | \n",
+ " Sun | \n",
+ "
\n",
+ " \n",
+ " | 42 | \n",
+ " 3.06 | \n",
+ " Male | \n",
+ " Sun | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " 1.32 | \n",
+ " Male | \n",
+ " Sun | \n",
+ "
\n",
+ " \n",
+ " | 44 | \n",
+ " 5.60 | \n",
+ " Male | \n",
+ " Sun | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tip sex day\n",
+ "20 4.08 Male Sat\n",
+ "21 2.75 Female Sat\n",
+ "22 2.23 Female Sat\n",
+ "23 7.58 Male Sat\n",
+ "24 3.18 Male Sat\n",
+ "25 2.34 Male Sat\n",
+ "26 2.00 Male Sat\n",
+ "27 2.00 Male Sat\n",
+ "28 4.30 Male Sat\n",
+ "29 3.00 Female Sat\n",
+ "30 1.45 Male Sat\n",
+ "31 2.50 Male Sat\n",
+ "32 3.00 Female Sat\n",
+ "33 2.45 Female Sat\n",
+ "34 3.27 Male Sat\n",
+ "35 3.60 Male Sat\n",
+ "36 2.00 Male Sat\n",
+ "37 3.07 Female Sat\n",
+ "38 2.31 Male Sat\n",
+ "39 5.00 Male Sat\n",
+ "40 2.24 Male Sat\n",
+ "41 2.54 Male Sun\n",
+ "42 3.06 Male Sun\n",
+ "43 1.32 Male Sun\n",
+ "44 5.60 Male Sun"
+ ]
+ },
+ "execution_count": 83,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Challenge:\n",
"\n",
"# Using the tips dataframe, create a new one that contains the \n",
"# information contained in all rows between the 20th (inclusive) \n",
- "# and the 45th (exclusive) and only the columns: tip, sex, day"
+ "# and the 45th (exclusive) and only the columns: tip, sex, day\n",
+ "\n",
+ "tips.loc[20:44, ['tip', 'sex', 'day']]"
]
},
{
@@ -834,9 +1744,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 100,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.40463164280330477\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.40463164280330477"
+ ]
+ },
+ "execution_count": 100,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Challenge:\n",
"\n",
@@ -844,7 +1772,43 @@
"# tip and size for only Male clients during Dinner. \n",
"\n",
"# HINT: Remember that \"size\" cannot be accessed via dot notation, as it's an \n",
- "# attribute of the series!"
+ "# attribute of the series!\n",
+ "\n",
+ "print(tips[(tips.sex == 'Male') & (tips.time == 'Dinner')].tip.corr(tips[(tips.sex == 'Male') & (tips.time == 'Dinner')]['size']))\n",
+ "\n",
+ "male_dinner = tips[(tips.sex == 'Male') & (tips.time == 'Dinner')]\n",
+ "male_dinner.tip.corr(male_dinner['size'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 Dinner\n",
+ "1 Dinner\n",
+ "2 Dinner\n",
+ "3 Dinner\n",
+ "4 Dinner\n",
+ " ... \n",
+ "239 Dinner\n",
+ "240 Dinner\n",
+ "241 Dinner\n",
+ "242 Dinner\n",
+ "243 Dinner\n",
+ "Name: time, Length: 244, dtype: object"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tips.time"
]
},
{
@@ -864,21 +1828,219 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 104,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " total_bill | \n",
+ " tip | \n",
+ "
\n",
+ " \n",
+ " | size | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " 7.242500 | \n",
+ " 1.437500 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 16.448013 | \n",
+ " 2.582308 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 23.277632 | \n",
+ " 3.393158 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 28.613514 | \n",
+ " 4.135405 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 30.068000 | \n",
+ " 4.028000 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 34.830000 | \n",
+ " 5.225000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " total_bill tip\n",
+ "size \n",
+ "1 7.242500 1.437500\n",
+ "2 16.448013 2.582308\n",
+ "3 23.277632 3.393158\n",
+ "4 28.613514 4.135405\n",
+ "5 30.068000 4.028000\n",
+ "6 34.830000 5.225000"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Group tips dataframe by size of table\n",
"by_size = tips.groupby(\"size\")\n",
"\n",
- "by_size"
+ "by_size.mean()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 105,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(1, total_bill tip sex smoker day time size\n",
+ " 67 3.07 1.00 Female Yes Sat Dinner 1\n",
+ " 82 10.07 1.83 Female No Thur Lunch 1\n",
+ " 111 7.25 1.00 Female No Sat Dinner 1\n",
+ " 222 8.58 1.92 Male Yes Fri Lunch 1),\n",
+ " (2, total_bill tip sex smoker day time size\n",
+ " 0 16.99 1.01 Female No Sun Dinner 2\n",
+ " 3 23.68 3.31 Male No Sun Dinner 2\n",
+ " 6 8.77 2.00 Male No Sun Dinner 2\n",
+ " 8 15.04 1.96 Male No Sun Dinner 2\n",
+ " 9 14.78 3.23 Male No Sun Dinner 2\n",
+ " .. ... ... ... ... ... ... ...\n",
+ " 237 32.83 1.17 Male Yes Sat Dinner 2\n",
+ " 240 27.18 2.00 Female Yes Sat Dinner 2\n",
+ " 241 22.67 2.00 Male Yes Sat Dinner 2\n",
+ " 242 17.82 1.75 Male No Sat Dinner 2\n",
+ " 243 18.78 3.00 Female No Thur Dinner 2\n",
+ " \n",
+ " [156 rows x 7 columns]),\n",
+ " (3, total_bill tip sex smoker day time size\n",
+ " 1 10.34 1.66 Male No Sun Dinner 3\n",
+ " 2 21.01 3.50 Male No Sun Dinner 3\n",
+ " 16 10.33 1.67 Female No Sun Dinner 3\n",
+ " 17 16.29 3.71 Male No Sun Dinner 3\n",
+ " 18 16.97 3.50 Female No Sun Dinner 3\n",
+ " 19 20.65 3.35 Male No Sat Dinner 3\n",
+ " 35 24.06 3.60 Male No Sat Dinner 3\n",
+ " 36 16.31 2.00 Male No Sat Dinner 3\n",
+ " 37 16.93 3.07 Female No Sat Dinner 3\n",
+ " 38 18.69 2.31 Male No Sat Dinner 3\n",
+ " 39 31.27 5.00 Male No Sat Dinner 3\n",
+ " 40 16.04 2.24 Male No Sat Dinner 3\n",
+ " 48 28.55 2.05 Male No Sun Dinner 3\n",
+ " 64 17.59 2.64 Male No Sat Dinner 3\n",
+ " 65 20.08 3.15 Male No Sat Dinner 3\n",
+ " 71 17.07 3.00 Female No Sat Dinner 3\n",
+ " 102 44.30 2.50 Female Yes Sat Dinner 3\n",
+ " 112 38.07 4.00 Male No Sun Dinner 3\n",
+ " 114 25.71 4.00 Female No Sun Dinner 3\n",
+ " 129 22.82 2.18 Male No Thur Lunch 3\n",
+ " 146 18.64 1.36 Female No Thur Lunch 3\n",
+ " 152 17.26 2.74 Male No Sun Dinner 3\n",
+ " 162 16.21 2.00 Female No Sun Dinner 3\n",
+ " 165 24.52 3.48 Male No Sun Dinner 3\n",
+ " 170 50.81 10.00 Male Yes Sat Dinner 3\n",
+ " 182 45.35 3.50 Male Yes Sun Dinner 3\n",
+ " 186 20.90 3.50 Female Yes Sun Dinner 3\n",
+ " 188 18.15 3.50 Female Yes Sun Dinner 3\n",
+ " 189 23.10 4.00 Male Yes Sun Dinner 3\n",
+ " 200 18.71 4.00 Male Yes Thur Lunch 3\n",
+ " 205 16.47 3.23 Female Yes Thur Lunch 3\n",
+ " 206 26.59 3.41 Male Yes Sat Dinner 3\n",
+ " 210 30.06 2.00 Male Yes Sat Dinner 3\n",
+ " 214 28.17 6.50 Female Yes Sat Dinner 3\n",
+ " 223 15.98 3.00 Female No Fri Lunch 3\n",
+ " 231 15.69 3.00 Male Yes Sat Dinner 3\n",
+ " 238 35.83 4.67 Female No Sat Dinner 3\n",
+ " 239 29.03 5.92 Male No Sat Dinner 3),\n",
+ " (4, total_bill tip sex smoker day time size\n",
+ " 4 24.59 3.61 Female No Sun Dinner 4\n",
+ " 5 25.29 4.71 Male No Sun Dinner 4\n",
+ " 7 26.88 3.12 Male No Sun Dinner 4\n",
+ " 11 35.26 5.00 Female No Sun Dinner 4\n",
+ " 13 18.43 3.00 Male No Sun Dinner 4\n",
+ " 23 39.42 7.58 Male No Sat Dinner 4\n",
+ " 25 17.81 2.34 Male No Sat Dinner 4\n",
+ " 31 18.35 2.50 Male No Sat Dinner 4\n",
+ " 33 20.69 2.45 Female No Sat Dinner 4\n",
+ " 44 30.40 5.60 Male No Sun Dinner 4\n",
+ " 47 32.40 6.00 Male No Sun Dinner 4\n",
+ " 52 34.81 5.20 Female No Sun Dinner 4\n",
+ " 54 25.56 4.34 Male No Sun Dinner 4\n",
+ " 56 38.01 3.00 Male Yes Sat Dinner 4\n",
+ " 59 48.27 6.73 Male No Sat Dinner 4\n",
+ " 63 18.29 3.76 Male Yes Sat Dinner 4\n",
+ " 77 27.20 4.00 Male No Thur Lunch 4\n",
+ " 85 34.83 5.17 Female No Thur Lunch 4\n",
+ " 95 40.17 4.73 Male Yes Fri Dinner 4\n",
+ " 116 29.93 5.07 Male No Sun Dinner 4\n",
+ " 119 24.08 2.92 Female No Thur Lunch 4\n",
+ " 153 24.55 2.00 Male No Sun Dinner 4\n",
+ " 154 19.77 2.00 Male No Sun Dinner 4\n",
+ " 157 25.00 3.75 Female No Sun Dinner 4\n",
+ " 159 16.49 2.00 Male No Sun Dinner 4\n",
+ " 160 21.50 3.50 Male No Sun Dinner 4\n",
+ " 167 31.71 4.50 Male No Sun Dinner 4\n",
+ " 180 34.65 3.68 Male Yes Sun Dinner 4\n",
+ " 183 23.17 6.50 Male Yes Sun Dinner 4\n",
+ " 197 43.11 5.00 Female Yes Thur Lunch 4\n",
+ " 204 20.53 4.00 Male Yes Thur Lunch 4\n",
+ " 207 38.73 3.00 Male Yes Sat Dinner 4\n",
+ " 211 25.89 5.16 Male Yes Sat Dinner 4\n",
+ " 212 48.33 9.00 Male No Sat Dinner 4\n",
+ " 219 30.14 3.09 Female Yes Sat Dinner 4\n",
+ " 227 20.45 3.00 Male No Sat Dinner 4\n",
+ " 230 24.01 2.00 Male Yes Sat Dinner 4),\n",
+ " (5, total_bill tip sex smoker day time size\n",
+ " 142 41.19 5.00 Male No Thur Lunch 5\n",
+ " 155 29.85 5.14 Female No Sun Dinner 5\n",
+ " 185 20.69 5.00 Male No Sun Dinner 5\n",
+ " 187 30.46 2.00 Male Yes Sun Dinner 5\n",
+ " 216 28.15 3.00 Male Yes Sat Dinner 5),\n",
+ " (6, total_bill tip sex smoker day time size\n",
+ " 125 29.80 4.2 Female No Thur Lunch 6\n",
+ " 141 34.30 6.7 Male No Thur Lunch 6\n",
+ " 143 27.05 5.0 Female No Thur Lunch 6\n",
+ " 156 48.17 5.0 Male No Sun Dinner 6)]"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# If we coerce it to a list, we see something interesting: \n",
"# It's basically a list of tuples! \n",
@@ -890,9 +2052,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 106,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Female\n",
+ "total_bill 18.056897\n",
+ "tip 2.833448\n",
+ "size 2.459770\n",
+ "dtype: float64\n",
+ "Male\n",
+ "total_bill 20.744076\n",
+ "tip 3.089618\n",
+ "size 2.630573\n",
+ "dtype: float64\n"
+ ]
+ }
+ ],
"source": [
"# We can iterate through the groupby just like we would a list of tuples!\n",
"\n",
@@ -921,28 +2100,83 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 113,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "sex\n",
+ "Female 44.30\n",
+ "Male 50.81\n",
+ "Name: total_bill, dtype: float64"
+ ]
+ },
+ "execution_count": 113,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Get the maximum bill by gender: \n",
"\n",
"def max_bill(df):\n",
" return df.total_bill.max()\n",
"\n",
- "tips.groupby(\"sex\").apply(max_bill)"
+ "tips.groupby(\"sex\").apply(max_bill)  # result is not displayed: a cell only shows its last expression\n",
+ "\n",
+ "tips.groupby(\"sex\").total_bill.max()  # same per-group maximum via the built-in aggregation; this is the output shown"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 121,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "sex\n",
+ "Female 48.33\n",
+ "Male 48.33\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 121,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Challenge: \n",
"\n",
"# Get the second largest bill by gender!\n",
- "# HINT: use sort_values and iloc!"
+ "# HINT: use sort_values and iloc!\n",
+ "\n",
+ "def snd(df):\n",
+ " return tips.sort_values('total_bill').iloc[-2,0]\n",
+ "\n",
+ "tips.groupby('sex').apply(snd)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 119,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "50.81"
+ ]
+ },
+ "execution_count": 119,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tips.sort_values('total_bill').iloc[-1,0]"
]
},
{
@@ -961,9 +2195,74 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | day | \n",
+ " Fri | \n",
+ " Sat | \n",
+ " Sun | \n",
+ " Thur | \n",
+ "
\n",
+ " \n",
+ " | sex | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Female | \n",
+ " 2.781111 | \n",
+ " 2.801786 | \n",
+ " 3.367222 | \n",
+ " 2.575625 | \n",
+ "
\n",
+ " \n",
+ " | Male | \n",
+ " 2.693000 | \n",
+ " 3.083898 | \n",
+ " 3.220345 | \n",
+ " 2.980333 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "day Fri Sat Sun Thur\n",
+ "sex \n",
+ "Female 2.781111 2.801786 3.367222 2.575625\n",
+ "Male 2.693000 3.083898 3.220345 2.980333"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Challenge: \n",
"# What is the mean tip, per day, for male vs. female?\n",
@@ -972,7 +2271,7 @@
"def day_mean(df):\n",
" # Hint: you will need to group by \"day\"\n",
" # in this function, then get the mean tip. \n",
- " pass\n",
+ " return df.groupby('day').tip.mean()\n",
"\n",
"\n",
"tips.groupby(\"sex\").apply(day_mean)"
@@ -998,9 +2297,271 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(('Female', 'Fri'), total_bill tip sex smoker day time size\n",
+ " 92 5.75 1.00 Female Yes Fri Dinner 2\n",
+ " 93 16.32 4.30 Female Yes Fri Dinner 2\n",
+ " 94 22.75 3.25 Female No Fri Dinner 2\n",
+ " 100 11.35 2.50 Female Yes Fri Dinner 2\n",
+ " 101 15.38 3.00 Female Yes Fri Dinner 2\n",
+ " 221 13.42 3.48 Female Yes Fri Lunch 2\n",
+ " 223 15.98 3.00 Female No Fri Lunch 3\n",
+ " 225 16.27 2.50 Female Yes Fri Lunch 2\n",
+ " 226 10.09 2.00 Female Yes Fri Lunch 2),\n",
+ " (('Female', 'Sat'), total_bill tip sex smoker day time size\n",
+ " 21 20.29 2.75 Female No Sat Dinner 2\n",
+ " 22 15.77 2.23 Female No Sat Dinner 2\n",
+ " 29 19.65 3.00 Female No Sat Dinner 2\n",
+ " 32 15.06 3.00 Female No Sat Dinner 2\n",
+ " 33 20.69 2.45 Female No Sat Dinner 4\n",
+ " 37 16.93 3.07 Female No Sat Dinner 3\n",
+ " 57 26.41 1.50 Female No Sat Dinner 2\n",
+ " 66 16.45 2.47 Female No Sat Dinner 2\n",
+ " 67 3.07 1.00 Female Yes Sat Dinner 1\n",
+ " 71 17.07 3.00 Female No Sat Dinner 3\n",
+ " 72 26.86 3.14 Female Yes Sat Dinner 2\n",
+ " 73 25.28 5.00 Female Yes Sat Dinner 2\n",
+ " 74 14.73 2.20 Female No Sat Dinner 2\n",
+ " 102 44.30 2.50 Female Yes Sat Dinner 3\n",
+ " 103 22.42 3.48 Female Yes Sat Dinner 2\n",
+ " 104 20.92 4.08 Female No Sat Dinner 2\n",
+ " 109 14.31 4.00 Female Yes Sat Dinner 2\n",
+ " 111 7.25 1.00 Female No Sat Dinner 1\n",
+ " 168 10.59 1.61 Female Yes Sat Dinner 2\n",
+ " 169 10.63 2.00 Female Yes Sat Dinner 2\n",
+ " 209 12.76 2.23 Female Yes Sat Dinner 2\n",
+ " 213 13.27 2.50 Female Yes Sat Dinner 2\n",
+ " 214 28.17 6.50 Female Yes Sat Dinner 3\n",
+ " 215 12.90 1.10 Female Yes Sat Dinner 2\n",
+ " 219 30.14 3.09 Female Yes Sat Dinner 4\n",
+ " 229 22.12 2.88 Female Yes Sat Dinner 2\n",
+ " 238 35.83 4.67 Female No Sat Dinner 3\n",
+ " 240 27.18 2.00 Female Yes Sat Dinner 2),\n",
+ " (('Female', 'Sun'), total_bill tip sex smoker day time size\n",
+ " 0 16.99 1.01 Female No Sun Dinner 2\n",
+ " 4 24.59 3.61 Female No Sun Dinner 4\n",
+ " 11 35.26 5.00 Female No Sun Dinner 4\n",
+ " 14 14.83 3.02 Female No Sun Dinner 2\n",
+ " 16 10.33 1.67 Female No Sun Dinner 3\n",
+ " 18 16.97 3.50 Female No Sun Dinner 3\n",
+ " 51 10.29 2.60 Female No Sun Dinner 2\n",
+ " 52 34.81 5.20 Female No Sun Dinner 4\n",
+ " 114 25.71 4.00 Female No Sun Dinner 3\n",
+ " 115 17.31 3.50 Female No Sun Dinner 2\n",
+ " 155 29.85 5.14 Female No Sun Dinner 5\n",
+ " 157 25.00 3.75 Female No Sun Dinner 4\n",
+ " 158 13.39 2.61 Female No Sun Dinner 2\n",
+ " 162 16.21 2.00 Female No Sun Dinner 3\n",
+ " 164 17.51 3.00 Female Yes Sun Dinner 2\n",
+ " 178 9.60 4.00 Female Yes Sun Dinner 2\n",
+ " 186 20.90 3.50 Female Yes Sun Dinner 3\n",
+ " 188 18.15 3.50 Female Yes Sun Dinner 3),\n",
+ " (('Female', 'Thur'), total_bill tip sex smoker day time size\n",
+ " 82 10.07 1.83 Female No Thur Lunch 1\n",
+ " 85 34.83 5.17 Female No Thur Lunch 4\n",
+ " 117 10.65 1.50 Female No Thur Lunch 2\n",
+ " 118 12.43 1.80 Female No Thur Lunch 2\n",
+ " 119 24.08 2.92 Female No Thur Lunch 4\n",
+ " 121 13.42 1.68 Female No Thur Lunch 2\n",
+ " 124 12.48 2.52 Female No Thur Lunch 2\n",
+ " 125 29.80 4.20 Female No Thur Lunch 6\n",
+ " 127 14.52 2.00 Female No Thur Lunch 2\n",
+ " 128 11.38 2.00 Female No Thur Lunch 2\n",
+ " 131 20.27 2.83 Female No Thur Lunch 2\n",
+ " 132 11.17 1.50 Female No Thur Lunch 2\n",
+ " 133 12.26 2.00 Female No Thur Lunch 2\n",
+ " 134 18.26 3.25 Female No Thur Lunch 2\n",
+ " 135 8.51 1.25 Female No Thur Lunch 2\n",
+ " 136 10.33 2.00 Female No Thur Lunch 2\n",
+ " 137 14.15 2.00 Female No Thur Lunch 2\n",
+ " 139 13.16 2.75 Female No Thur Lunch 2\n",
+ " 140 17.47 3.50 Female No Thur Lunch 2\n",
+ " 143 27.05 5.00 Female No Thur Lunch 6\n",
+ " 144 16.43 2.30 Female No Thur Lunch 2\n",
+ " 145 8.35 1.50 Female No Thur Lunch 2\n",
+ " 146 18.64 1.36 Female No Thur Lunch 3\n",
+ " 147 11.87 1.63 Female No Thur Lunch 2\n",
+ " 191 19.81 4.19 Female Yes Thur Lunch 2\n",
+ " 197 43.11 5.00 Female Yes Thur Lunch 4\n",
+ " 198 13.00 2.00 Female Yes Thur Lunch 2\n",
+ " 201 12.74 2.01 Female Yes Thur Lunch 2\n",
+ " 202 13.00 2.00 Female Yes Thur Lunch 2\n",
+ " 203 16.40 2.50 Female Yes Thur Lunch 2\n",
+ " 205 16.47 3.23 Female Yes Thur Lunch 3\n",
+ " 243 18.78 3.00 Female No Thur Dinner 2),\n",
+ " (('Male', 'Fri'), total_bill tip sex smoker day time size\n",
+ " 90 28.97 3.00 Male Yes Fri Dinner 2\n",
+ " 91 22.49 3.50 Male No Fri Dinner 2\n",
+ " 95 40.17 4.73 Male Yes Fri Dinner 4\n",
+ " 96 27.28 4.00 Male Yes Fri Dinner 2\n",
+ " 97 12.03 1.50 Male Yes Fri Dinner 2\n",
+ " 98 21.01 3.00 Male Yes Fri Dinner 2\n",
+ " 99 12.46 1.50 Male No Fri Dinner 2\n",
+ " 220 12.16 2.20 Male Yes Fri Lunch 2\n",
+ " 222 8.58 1.92 Male Yes Fri Lunch 1\n",
+ " 224 13.42 1.58 Male Yes Fri Lunch 2),\n",
+ " (('Male', 'Sat'), total_bill tip sex smoker day time size\n",
+ " 19 20.65 3.35 Male No Sat Dinner 3\n",
+ " 20 17.92 4.08 Male No Sat Dinner 2\n",
+ " 23 39.42 7.58 Male No Sat Dinner 4\n",
+ " 24 19.82 3.18 Male No Sat Dinner 2\n",
+ " 25 17.81 2.34 Male No Sat Dinner 4\n",
+ " 26 13.37 2.00 Male No Sat Dinner 2\n",
+ " 27 12.69 2.00 Male No Sat Dinner 2\n",
+ " 28 21.70 4.30 Male No Sat Dinner 2\n",
+ " 30 9.55 1.45 Male No Sat Dinner 2\n",
+ " 31 18.35 2.50 Male No Sat Dinner 4\n",
+ " 34 17.78 3.27 Male No Sat Dinner 2\n",
+ " 35 24.06 3.60 Male No Sat Dinner 3\n",
+ " 36 16.31 2.00 Male No Sat Dinner 3\n",
+ " 38 18.69 2.31 Male No Sat Dinner 3\n",
+ " 39 31.27 5.00 Male No Sat Dinner 3\n",
+ " 40 16.04 2.24 Male No Sat Dinner 3\n",
+ " 56 38.01 3.00 Male Yes Sat Dinner 4\n",
+ " 58 11.24 1.76 Male Yes Sat Dinner 2\n",
+ " 59 48.27 6.73 Male No Sat Dinner 4\n",
+ " 60 20.29 3.21 Male Yes Sat Dinner 2\n",
+ " 61 13.81 2.00 Male Yes Sat Dinner 2\n",
+ " 62 11.02 1.98 Male Yes Sat Dinner 2\n",
+ " 63 18.29 3.76 Male Yes Sat Dinner 4\n",
+ " 64 17.59 2.64 Male No Sat Dinner 3\n",
+ " 65 20.08 3.15 Male No Sat Dinner 3\n",
+ " 68 20.23 2.01 Male No Sat Dinner 2\n",
+ " 69 15.01 2.09 Male Yes Sat Dinner 2\n",
+ " 70 12.02 1.97 Male No Sat Dinner 2\n",
+ " 75 10.51 1.25 Male No Sat Dinner 2\n",
+ " 76 17.92 3.08 Male Yes Sat Dinner 2\n",
+ " 105 15.36 1.64 Male Yes Sat Dinner 2\n",
+ " 106 20.49 4.06 Male Yes Sat Dinner 2\n",
+ " 107 25.21 4.29 Male Yes Sat Dinner 2\n",
+ " 108 18.24 3.76 Male No Sat Dinner 2\n",
+ " 110 14.00 3.00 Male No Sat Dinner 2\n",
+ " 170 50.81 10.00 Male Yes Sat Dinner 3\n",
+ " 171 15.81 3.16 Male Yes Sat Dinner 2\n",
+ " 206 26.59 3.41 Male Yes Sat Dinner 3\n",
+ " 207 38.73 3.00 Male Yes Sat Dinner 4\n",
+ " 208 24.27 2.03 Male Yes Sat Dinner 2\n",
+ " 210 30.06 2.00 Male Yes Sat Dinner 3\n",
+ " 211 25.89 5.16 Male Yes Sat Dinner 4\n",
+ " 212 48.33 9.00 Male No Sat Dinner 4\n",
+ " 216 28.15 3.00 Male Yes Sat Dinner 5\n",
+ " 217 11.59 1.50 Male Yes Sat Dinner 2\n",
+ " 218 7.74 1.44 Male Yes Sat Dinner 2\n",
+ " 227 20.45 3.00 Male No Sat Dinner 4\n",
+ " 228 13.28 2.72 Male No Sat Dinner 2\n",
+ " 230 24.01 2.00 Male Yes Sat Dinner 4\n",
+ " 231 15.69 3.00 Male Yes Sat Dinner 3\n",
+ " 232 11.61 3.39 Male No Sat Dinner 2\n",
+ " 233 10.77 1.47 Male No Sat Dinner 2\n",
+ " 234 15.53 3.00 Male Yes Sat Dinner 2\n",
+ " 235 10.07 1.25 Male No Sat Dinner 2\n",
+ " 236 12.60 1.00 Male Yes Sat Dinner 2\n",
+ " 237 32.83 1.17 Male Yes Sat Dinner 2\n",
+ " 239 29.03 5.92 Male No Sat Dinner 3\n",
+ " 241 22.67 2.00 Male Yes Sat Dinner 2\n",
+ " 242 17.82 1.75 Male No Sat Dinner 2),\n",
+ " (('Male', 'Sun'), total_bill tip sex smoker day time size\n",
+ " 1 10.34 1.66 Male No Sun Dinner 3\n",
+ " 2 21.01 3.50 Male No Sun Dinner 3\n",
+ " 3 23.68 3.31 Male No Sun Dinner 2\n",
+ " 5 25.29 4.71 Male No Sun Dinner 4\n",
+ " 6 8.77 2.00 Male No Sun Dinner 2\n",
+ " 7 26.88 3.12 Male No Sun Dinner 4\n",
+ " 8 15.04 1.96 Male No Sun Dinner 2\n",
+ " 9 14.78 3.23 Male No Sun Dinner 2\n",
+ " 10 10.27 1.71 Male No Sun Dinner 2\n",
+ " 12 15.42 1.57 Male No Sun Dinner 2\n",
+ " 13 18.43 3.00 Male No Sun Dinner 4\n",
+ " 15 21.58 3.92 Male No Sun Dinner 2\n",
+ " 17 16.29 3.71 Male No Sun Dinner 3\n",
+ " 41 17.46 2.54 Male No Sun Dinner 2\n",
+ " 42 13.94 3.06 Male No Sun Dinner 2\n",
+ " 43 9.68 1.32 Male No Sun Dinner 2\n",
+ " 44 30.40 5.60 Male No Sun Dinner 4\n",
+ " 45 18.29 3.00 Male No Sun Dinner 2\n",
+ " 46 22.23 5.00 Male No Sun Dinner 2\n",
+ " 47 32.40 6.00 Male No Sun Dinner 4\n",
+ " 48 28.55 2.05 Male No Sun Dinner 3\n",
+ " 49 18.04 3.00 Male No Sun Dinner 2\n",
+ " 50 12.54 2.50 Male No Sun Dinner 2\n",
+ " 53 9.94 1.56 Male No Sun Dinner 2\n",
+ " 54 25.56 4.34 Male No Sun Dinner 4\n",
+ " 55 19.49 3.51 Male No Sun Dinner 2\n",
+ " 112 38.07 4.00 Male No Sun Dinner 3\n",
+ " 113 23.95 2.55 Male No Sun Dinner 2\n",
+ " 116 29.93 5.07 Male No Sun Dinner 4\n",
+ " 150 14.07 2.50 Male No Sun Dinner 2\n",
+ " 151 13.13 2.00 Male No Sun Dinner 2\n",
+ " 152 17.26 2.74 Male No Sun Dinner 3\n",
+ " 153 24.55 2.00 Male No Sun Dinner 4\n",
+ " 154 19.77 2.00 Male No Sun Dinner 4\n",
+ " 156 48.17 5.00 Male No Sun Dinner 6\n",
+ " 159 16.49 2.00 Male No Sun Dinner 4\n",
+ " 160 21.50 3.50 Male No Sun Dinner 4\n",
+ " 161 12.66 2.50 Male No Sun Dinner 2\n",
+ " 163 13.81 2.00 Male No Sun Dinner 2\n",
+ " 165 24.52 3.48 Male No Sun Dinner 3\n",
+ " 166 20.76 2.24 Male No Sun Dinner 2\n",
+ " 167 31.71 4.50 Male No Sun Dinner 4\n",
+ " 172 7.25 5.15 Male Yes Sun Dinner 2\n",
+ " 173 31.85 3.18 Male Yes Sun Dinner 2\n",
+ " 174 16.82 4.00 Male Yes Sun Dinner 2\n",
+ " 175 32.90 3.11 Male Yes Sun Dinner 2\n",
+ " 176 17.89 2.00 Male Yes Sun Dinner 2\n",
+ " 177 14.48 2.00 Male Yes Sun Dinner 2\n",
+ " 179 34.63 3.55 Male Yes Sun Dinner 2\n",
+ " 180 34.65 3.68 Male Yes Sun Dinner 4\n",
+ " 181 23.33 5.65 Male Yes Sun Dinner 2\n",
+ " 182 45.35 3.50 Male Yes Sun Dinner 3\n",
+ " 183 23.17 6.50 Male Yes Sun Dinner 4\n",
+ " 184 40.55 3.00 Male Yes Sun Dinner 2\n",
+ " 185 20.69 5.00 Male No Sun Dinner 5\n",
+ " 187 30.46 2.00 Male Yes Sun Dinner 5\n",
+ " 189 23.10 4.00 Male Yes Sun Dinner 3\n",
+ " 190 15.69 1.50 Male Yes Sun Dinner 2),\n",
+ " (('Male', 'Thur'), total_bill tip sex smoker day time size\n",
+ " 77 27.20 4.00 Male No Thur Lunch 4\n",
+ " 78 22.76 3.00 Male No Thur Lunch 2\n",
+ " 79 17.29 2.71 Male No Thur Lunch 2\n",
+ " 80 19.44 3.00 Male Yes Thur Lunch 2\n",
+ " 81 16.66 3.40 Male No Thur Lunch 2\n",
+ " 83 32.68 5.00 Male Yes Thur Lunch 2\n",
+ " 84 15.98 2.03 Male No Thur Lunch 2\n",
+ " 86 13.03 2.00 Male No Thur Lunch 2\n",
+ " 87 18.28 4.00 Male No Thur Lunch 2\n",
+ " 88 24.71 5.85 Male No Thur Lunch 2\n",
+ " 89 21.16 3.00 Male No Thur Lunch 2\n",
+ " 120 11.69 2.31 Male No Thur Lunch 2\n",
+ " 122 14.26 2.50 Male No Thur Lunch 2\n",
+ " 123 15.95 2.00 Male No Thur Lunch 2\n",
+ " 126 8.52 1.48 Male No Thur Lunch 2\n",
+ " 129 22.82 2.18 Male No Thur Lunch 3\n",
+ " 130 19.08 1.50 Male No Thur Lunch 2\n",
+ " 138 16.00 2.00 Male Yes Thur Lunch 2\n",
+ " 141 34.30 6.70 Male No Thur Lunch 6\n",
+ " 142 41.19 5.00 Male No Thur Lunch 5\n",
+ " 148 9.78 1.73 Male No Thur Lunch 2\n",
+ " 149 7.51 2.00 Male No Thur Lunch 2\n",
+ " 192 28.44 2.56 Male Yes Thur Lunch 2\n",
+ " 193 15.48 2.02 Male Yes Thur Lunch 2\n",
+ " 194 16.58 4.00 Male Yes Thur Lunch 2\n",
+ " 195 7.56 1.44 Male No Thur Lunch 2\n",
+ " 196 10.34 2.00 Male Yes Thur Lunch 2\n",
+ " 199 13.51 2.00 Male Yes Thur Lunch 2\n",
+ " 200 18.71 4.00 Male Yes Thur Lunch 3\n",
+ " 204 20.53 4.00 Male Yes Thur Lunch 4)]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Take a look at the structure of the multiple groupby!\n",
"\n",
@@ -1133,9 +2694,70 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " screenname | \n",
+ " id_str | \n",
+ " text | \n",
+ " hashtags | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " nandanrao | \n",
+ " 928374987 | \n",
+ " Woah, pandas is so much fun #worldrocked #jawd... | \n",
+ " [worldrocked, jawdrop, ml] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " om | \n",
+ " 98214039 | \n",
+ " I eat linear models for breakfast #datascience... | \n",
+ " [datascience, ml, crossfit] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " screenname id_str text \\\n",
+ "0 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n",
+ "1 om 98214039 I eat linear models for breakfast #datascience... \n",
+ "\n",
+ " hashtags \n",
+ "0 [worldrocked, jawdrop, ml] \n",
+ "1 [datascience, ml, crossfit] "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"raw_tweets = [{ \"screenname\": \"nandanrao\",\n",
" \"id_str\": \"928374987\",\n",
@@ -1165,9 +2787,63 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " screenname | \n",
+ " id_str | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " nandanrao | \n",
+ " 928374987 | \n",
+ " Woah, pandas is so much fun #worldrocked #jawd... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " om | \n",
+ " 98214039 | \n",
+ " I eat linear models for breakfast #datascience... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " screenname id_str text\n",
+ "0 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd...\n",
+ "1 om 98214039 I eat linear models for breakfast #datascience..."
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"tweets = pd.DataFrame(raw_tweets, columns = [\"screenname\", \"id_str\", \"text\"])\n",
"tweets"
@@ -1175,9 +2851,84 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id_str | \n",
+ " hashtag | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 928374987 | \n",
+ " worldrocked | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 928374987 | \n",
+ " jawdrop | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 928374987 | \n",
+ " ml | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 98214039 | \n",
+ " datascience | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 98214039 | \n",
+ " ml | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 98214039 | \n",
+ " crossfit | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id_str hashtag\n",
+ "0 928374987 worldrocked\n",
+ "1 928374987 jawdrop\n",
+ "2 928374987 ml\n",
+ "3 98214039 datascience\n",
+ "4 98214039 ml\n",
+ "5 98214039 crossfit"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"tags_and_ids = [(t['id_str'], tag) \n",
" for t in raw_tweets \n",
@@ -1190,11 +2941,108 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " screenname | \n",
+ " id_str | \n",
+ " text | \n",
+ " hashtag | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " nandanrao | \n",
+ " 928374987 | \n",
+ " Woah, pandas is so much fun #worldrocked #jawd... | \n",
+ " worldrocked | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " nandanrao | \n",
+ " 928374987 | \n",
+ " Woah, pandas is so much fun #worldrocked #jawd... | \n",
+ " jawdrop | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " nandanrao | \n",
+ " 928374987 | \n",
+ " Woah, pandas is so much fun #worldrocked #jawd... | \n",
+ " ml | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " om | \n",
+ " 98214039 | \n",
+ " I eat linear models for breakfast #datascience... | \n",
+ " datascience | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " om | \n",
+ " 98214039 | \n",
+ " I eat linear models for breakfast #datascience... | \n",
+ " ml | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " om | \n",
+ " 98214039 | \n",
+ " I eat linear models for breakfast #datascience... | \n",
+ " crossfit | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " screenname id_str text \\\n",
+ "0 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n",
+ "1 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n",
+ "2 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n",
+ "3 om 98214039 I eat linear models for breakfast #datascience... \n",
+ "4 om 98214039 I eat linear models for breakfast #datascience... \n",
+ "5 om 98214039 I eat linear models for breakfast #datascience... \n",
+ "\n",
+ " hashtag \n",
+ "0 worldrocked \n",
+ "1 jawdrop \n",
+ "2 ml \n",
+ "3 datascience \n",
+ "4 ml \n",
+ "5 crossfit "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "df = tweets.merge(hashtags, how='left')\n",
+ "# Inner join on the column(s) the two frames share\n",
+ "df = pd.merge(tweets, hashtags, how='inner')\n",
"\n",
"df"
]
@@ -1218,6 +3066,678 @@
"\n",
"*Needless to say, eyeballing is OK for making sure your code makes sense, but it will not earn full credit for the project. We want fully automated code. To carry out the project successfully you need to use most of the attributes and methods described earlier. The last one is a little tricky.*"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Import modules\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "#Open CSV files\n",
+ "df_p = pd.read_csv('supermarket_prices.csv')\n",
+ "df_t = pd.read_csv('supermarket_transactions.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Product | \n",
+ " Price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " tomato | \n",
+ " 2.1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " potato | \n",
+ " 3.4 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " apple | \n",
+ " 1.2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " orange | \n",
+ " 4.3 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " banana | \n",
+ " 5.2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Product Price\n",
+ "0 tomato 2.1\n",
+ "1 potato 3.4\n",
+ "2 apple 1.2\n",
+ "3 orange 4.3\n",
+ "4 banana 5.2"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Buyer | \n",
+ " Product | \n",
+ " Quantity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Jackson | \n",
+ " apple | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Jackson | \n",
+ " apple | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " John | \n",
+ " orange | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " John | \n",
+ " potato | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Tom | \n",
+ " tomato | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 94 | \n",
+ " Sophia | \n",
+ " apple | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 95 | \n",
+ " Jackson | \n",
+ " potato | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 96 | \n",
+ " Liam | \n",
+ " potato | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 97 | \n",
+ " Sophia | \n",
+ " potato | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 98 | \n",
+ " John | \n",
+ " orange | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
99 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Buyer Product Quantity\n",
+ "0 Jackson apple 4\n",
+ "1 Jackson apple 9\n",
+ "2 John orange 9\n",
+ "3 John potato 10\n",
+ "4 Tom tomato 4\n",
+ ".. ... ... ...\n",
+ "94 Sophia apple 7\n",
+ "95 Jackson potato 8\n",
+ "96 Liam potato 2\n",
+ "97 Sophia potato 6\n",
+ "98 John orange 10\n",
+ "\n",
+ "[99 rows x 3 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Show the price list and the transaction log, one after the other\n",
+ "display(df_p, df_t)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Buyer\n",
+ "Emma 81\n",
+ "Jackson 70\n",
+ "John 122\n",
+ "Liam 81\n",
+ "Lucas 62\n",
+ "Sandra 78\n",
+ "Sophia 61\n",
+ "Tom 49\n",
+ "Name: Quantity, dtype: int64"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Total number of items bought by each client (sum of Quantity per Buyer)\n",
+ "df_t.groupby('Buyer')['Quantity'].sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Buyer Product\n",
+ "Emma apple 4\n",
+ " banana 4\n",
+ " potato 2\n",
+ " tomato 2\n",
+ "Jackson orange 5\n",
+ " apple 3\n",
+ " tomato 3\n",
+ " potato 1\n",
+ "John orange 7\n",
+ " banana 4\n",
+ " tomato 3\n",
+ " potato 2\n",
+ " apple 1\n",
+ "Liam banana 4\n",
+ " apple 3\n",
+ " potato 3\n",
+ " orange 2\n",
+ " tomato 1\n",
+ "Lucas orange 3\n",
+ " tomato 3\n",
+ " apple 2\n",
+ " potato 2\n",
+ " banana 1\n",
+ "Sandra orange 5\n",
+ " potato 4\n",
+ " banana 1\n",
+ " tomato 1\n",
+ "Sophia apple 3\n",
+ " banana 3\n",
+ " orange 2\n",
+ " potato 2\n",
+ " tomato 2\n",
+ "Tom apple 4\n",
+ " potato 3\n",
+ " tomato 3\n",
+ " banana 1\n",
+ "Name: Product, dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# How many items of each type each client has purchased.\n",
+ "# Sum Quantity per (Buyer, Product): value_counts() would only count the\n",
+ "# number of transaction rows, not the number of items bought.\n",
+ "df_t.groupby(['Buyer', 'Product'])['Quantity'].sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Buyer | \n",
+ " Product | \n",
+ " Quantity | \n",
+ " Price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Jackson | \n",
+ " apple | \n",
+ " 4 | \n",
+ " 1.2 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Jackson | \n",
+ " apple | \n",
+ " 9 | \n",
+ " 1.2 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " John | \n",
+ " orange | \n",
+ " 9 | \n",
+ " 4.3 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " John | \n",
+ " potato | \n",
+ " 10 | \n",
+ " 3.4 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Tom | \n",
+ " tomato | \n",
+ " 4 | \n",
+ " 2.1 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 94 | \n",
+ " Sophia | \n",
+ " apple | \n",
+ " 7 | \n",
+ " 1.2 | \n",
+ "
\n",
+ " \n",
+ " | 95 | \n",
+ " Jackson | \n",
+ " potato | \n",
+ " 8 | \n",
+ " 3.4 | \n",
+ "
\n",
+ " \n",
+ " | 96 | \n",
+ " Liam | \n",
+ " potato | \n",
+ " 2 | \n",
+ " 3.4 | \n",
+ "
\n",
+ " \n",
+ " | 97 | \n",
+ " Sophia | \n",
+ " potato | \n",
+ " 6 | \n",
+ " 3.4 | \n",
+ "
\n",
+ " \n",
+ " | 98 | \n",
+ " John | \n",
+ " orange | \n",
+ " 10 | \n",
+ " 4.3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
99 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Buyer Product Quantity Price\n",
+ "0 Jackson apple 4 1.2\n",
+ "1 Jackson apple 9 1.2\n",
+ "2 John orange 9 4.3\n",
+ "3 John potato 10 3.4\n",
+ "4 Tom tomato 4 2.1\n",
+ ".. ... ... ... ...\n",
+ "94 Sophia apple 7 1.2\n",
+ "95 Jackson potato 8 3.4\n",
+ "96 Liam potato 2 3.4\n",
+ "97 Sophia potato 6 3.4\n",
+ "98 John orange 10 4.3\n",
+ "\n",
+ "[99 rows x 4 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# First step towards the total amount spent by each client:\n",
+ "# attach the unit price of the product to every transaction row.\n",
+ "# A left join keeps every transaction.\n",
+ "df = pd.merge(df_t, df_p, on='Product', how='left')\n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Buyer | \n",
+ " Product | \n",
+ " Quantity | \n",
+ " Price | \n",
+ " Spending | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Jackson | \n",
+ " apple | \n",
+ " 4 | \n",
+ " 1.2 | \n",
+ " 4.8 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Jackson | \n",
+ " apple | \n",
+ " 9 | \n",
+ " 1.2 | \n",
+ " 10.8 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " John | \n",
+ " orange | \n",
+ " 9 | \n",
+ " 4.3 | \n",
+ " 38.7 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " John | \n",
+ " potato | \n",
+ " 10 | \n",
+ " 3.4 | \n",
+ " 34.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Tom | \n",
+ " tomato | \n",
+ " 4 | \n",
+ " 2.1 | \n",
+ " 8.4 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 94 | \n",
+ " Sophia | \n",
+ " apple | \n",
+ " 7 | \n",
+ " 1.2 | \n",
+ " 8.4 | \n",
+ "
\n",
+ " \n",
+ " | 95 | \n",
+ " Jackson | \n",
+ " potato | \n",
+ " 8 | \n",
+ " 3.4 | \n",
+ " 27.2 | \n",
+ "
\n",
+ " \n",
+ " | 96 | \n",
+ " Liam | \n",
+ " potato | \n",
+ " 2 | \n",
+ " 3.4 | \n",
+ " 6.8 | \n",
+ "
\n",
+ " \n",
+ " | 97 | \n",
+ " Sophia | \n",
+ " potato | \n",
+ " 6 | \n",
+ " 3.4 | \n",
+ " 20.4 | \n",
+ "
\n",
+ " \n",
+ " | 98 | \n",
+ " John | \n",
+ " orange | \n",
+ " 10 | \n",
+ " 4.3 | \n",
+ " 43.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
99 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Buyer Product Quantity Price Spending\n",
+ "0 Jackson apple 4 1.2 4.8\n",
+ "1 Jackson apple 9 1.2 10.8\n",
+ "2 John orange 9 4.3 38.7\n",
+ "3 John potato 10 3.4 34.0\n",
+ "4 Tom tomato 4 2.1 8.4\n",
+ ".. ... ... ... ... ...\n",
+ "94 Sophia apple 7 1.2 8.4\n",
+ "95 Jackson potato 8 3.4 27.2\n",
+ "96 Liam potato 2 3.4 6.8\n",
+ "97 Sophia potato 6 3.4 20.4\n",
+ "98 John orange 10 4.3 43.0\n",
+ "\n",
+ "[99 rows x 5 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Amount spent on each transaction row: units bought * unit price\n",
+ "df['Spending'] = df['Quantity'] * df['Price']\n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Buyer\n",
+ "Emma 246.4\n",
+ "Jackson 202.8\n",
+ "John 461.3\n",
+ "Liam 263.3\n",
+ "Lucas 176.0\n",
+ "Sandra 300.8\n",
+ "Sophia 189.4\n",
+ "Tom 126.1\n",
+ "Name: Spending, dtype: float64"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Total amount spent by each client (sum of Spending per Buyer)\n",
+ "df.groupby('Buyer')['Spending'].sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Buyer\n",
+ "Emma 135.2\n",
+ "John 145.6\n",
+ "Liam 83.2\n",
+ "Lucas 15.6\n",
+ "Sandra 10.4\n",
+ "Sophia 67.6\n",
+ "Tom 31.2\n",
+ "Name: Spending, dtype: float64"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Part 1:\n",
+ "# The company that provides the supermarket with bananas wishes to give a prize to the client that \n",
+ "# has spent the largest proportion of their spending on bananas. Who should win the prize?\n",
+ "\n",
+ "# Share of each client's total spending that went to bananas. Clients with\n",
+ "# no banana purchases are reindexed in with 0, so their share is 0.\n",
+ "total_spend = df.groupby('Buyer')['Spending'].sum()\n",
+ "banana_spend = df[df['Product'] == 'banana'].groupby('Buyer')['Spending'].sum()\n",
+ "banana_share = banana_spend.reindex(total_spend.index, fill_value=0) / total_spend\n",
+ "\n",
+ "# The winner has the largest share, not the largest absolute banana spend.\n",
+ "banana_share.idxmax()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Part 2:\n",
+ "# A marketing company that works with the supermarket is interested to understand better \n",
+ "# the characteristics of the three people that have spent most of their spending on bananas. \n",
+ "# For each one of them report the other product that they have spent most of their remaining income on"
+ ]
}
],
"metadata": {
@@ -1237,7 +3757,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.3"
+ "version": "3.7.3"
}
},
"nbformat": 4,