from datascience import *
import numpy as np
import seaborn as sns
# Route Table plotting methods (barh, hist, ...) to their interactive versions.
Table.interactive_plots()
# Load seaborn's example 'tips' dataset and convert it into a datascience Table.
tips = Table.from_df(sns.load_dataset('tips'))
# Bare expression: presumably the end of a notebook cell, so the table is displayed.
tips
# Count the rows for each unique 'time' value and draw the counts as horizontal bars.
tips.group('time').barh('time')
# Mean total bill and mean tip for each day. take(...) reorders the grouped
# rows (presumably into Thur, Fri, Sat, Sun order -- confirm against the output).
(
    tips.group('day', np.mean)
    .select('day', 'total_bill mean', 'tip mean')
    .take(3, 0, 1, 2)
)
# The same table drawn as a horizontal bar chart with custom axis labels and title.
(
    tips.group('day', np.mean)
    .select('day', 'total_bill mean', 'tip mean')
    .take(3, 0, 1, 2)
    .barh(
        'day',
        xaxis_title='Dollars',
        yaxis_title='Day of the Week',
        title='Tips and Total Bills on Various Days of the Week',
    )
)
# Show the table again, then draw a histogram of the 'tip' column.
# density=False asks for counts rather than densities on the vertical axis.
tips
tips.hist('tip', density=False)
`where` and `are.between`
tips.where('tip', are.between(1.9, 2.8))
# Number of rows whose tip is between 1.9 and 2.8
# (are.between includes the lower bound and excludes the upper bound).
tips.where('tip', are.between(1.9, 2.8)).num_rows
# Number of rows whose tip is between 4.6 and 5.5 (same bound convention).
tips.where('tip', are.between(4.6, 5.5)).num_rows
`density = False`?
Look at the histogram that results if we don't set `density = False`.
# Histogram of 'tip' with hist's default density setting (density is not overridden here).
tips.hist('tip')
This is a perfectly valid histogram too, but it's not one that we will study in this class.
# Small one-column table of heights used to demonstrate explicit bin edges.
numbers = Table().with_columns(
    'Height',
    np.array([
        72, 61, 63, 74, 68, 67, 65, 73, 65, 62,
        66, 69, 75, 61, 61, 61, 65, 60, 64,
    ]),
)
# Count histogram with hand-picked bin edges; consecutive edges are 4 apart.
numbers.hist('Height', density=False, bins=[60, 64, 68, 72, 76])
We can use the same customization arguments with `hist` as we did with `barh`.
# Count histogram of tips with a custom x-axis label, title, and figure size.
tips.hist(
    'tip',
    density=False,
    xaxis_title='Tip (Dollars)',
    title='Distribution of Tips',
    width=600,
    height=600,
)
# Same column with the bin edges written out explicitly: the integers 0 through 11.
tips.hist(
    'tip',
    density=False,
    bins=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
)
`np.arange`, revisited
tips.hist('tip',
density = False,
bins = np.arange(12))
Let's look at another column.
# Count histogram of the 'total_bill' column; no bins are specified here.
tips.hist('total_bill', density=False)
Before setting bins, it's a good idea to look at the smallest and largest values in the column.
# Smallest and largest total bill, checked before choosing where the bins start and stop.
tips.column('total_bill').min()
tips.column('total_bill').max()
# Bin edges 3, 6, ..., 51 (np.arange excludes the stop value 54).
bins_3 = np.arange(3, 54, 3)
bins_3
tips.hist(
    'total_bill',
    density=False,
    bins=bins_3,
    title='Distribution of Total Bills, Bin Width = 3',
    width=600,
    height=400,
)
# Bin edges 3, 10, ..., 52 (np.arange excludes the stop value 53).
bins_7 = np.arange(3, 53, 7)
bins_7
tips.hist(
    'total_bill',
    density=False,
    bins=bins_7,
    title='Distribution of Total Bills, Bin Width = 7',
    width=600,
    height=400,
)
# Bin edges 3, 13, ..., 53 (np.arange excludes the stop value 63).
bins_10 = np.arange(3, 63, 10)
bins_10
tips.hist(
    'total_bill',
    density=False,
    bins=bins_10,
    title='Distribution of Total Bills, Bin Width = 10',
    width=600,
    height=400,
)
# Show the table again before splitting the histogram by a categorical column.
tips
One category is `'time'` – we can make separate histograms for every unique value in `'time'`. As a reminder, there are two unique times, `'Lunch'` and `'Dinner'`, so we should expect to see two histograms.
# One count histogram of total bills per unique 'time' value.
tips.hist('total_bill', density=False, group='time')
# The same grouped histograms, this time using the bin edges stored in bins_3.
tips.hist('total_bill', density=False, group='time', bins=bins_3)
If we want these on separate axes:
# overlay=False requests a separate set of axes for each 'time' group.
tips.hist(
    'total_bill',
    density=False,
    group='time',
    overlay=False,
    width=700,
    height=500,
)
Note that for whatever reason, using `group`, `overlay`, and `bins` with an array all at the same time doesn't work. (I've raised the issue with the folks who maintain the `datascience` module.)
We could separate by other columns, like `'day'`.
# Show the table, then draw one count histogram of total bills per unique 'day' value.
tips
tips.hist(
    'total_bill',
    density=False,
    group='day',
    width=700,
    height=400,
)
There's too much going on there – but you can click the legend to hide certain days.
# Tip as a percentage of the total bill, computed element-wise for every row.
tip_pct = 100 * tips.column('tip') / tips.column('total_bill')
# Attach the percentages as a 'tip percentage' column and show the updated table.
tips = tips.with_columns('tip percentage', tip_pct)
tips
# tips.where('tip percentage', are.below(25)) \
# .hist('tip percentage',
# density = False,
# ...)
Run the following cell.
# IPython/Jupyter help syntax (not plain Python): shows the documentation for tips.hist.
tips.hist?