from datascience import *
import numpy as np
Table.interactive_plots()
import plotly.express as px
sky = Table.read_table('data/skyscrapers.csv') \
.where('status.current', are.contained_in(['completed', 'under construction'])) \
.select('name', 'location.city', 'location.latitude', 'location.longitude',
'statistics.floors above', 'statistics.height', 'status.completed.year') \
.relabeled(['location.city', 'location.latitude', 'location.longitude',
'statistics.floors above', 'statistics.height', 'status.completed.year'],
['city', 'latitude', 'longitude', 'floors', 'height', 'year']) \
.where('height', are.above(0)) \
.where('floors', are.above(0))
sky
sky.group('city') \
.where('count', are.above_or_equal_to(40)) \
.sort('count', descending = True) \
.barh('city', title = 'Number of Skyscrapers Per City')
# Remember, you're not responsible for the code here.
px.pie(sky.group('city').where('count', are.above_or_equal_to(40)).to_df(),
values = 'count',
names = 'city',
title = 'Number of Skyscrapers Per City (Top 10 Only)'
)
sky.shuffle()
sky.group('city') \
.where('count', are.above_or_equal_to(20)) \
.sort('count', descending = True)
sky.group('city') \
.where('count', are.above_or_equal_to(20)) \
.sort('count', descending = True) \
.barh('city', title = 'Number of Skyscrapers Per City (Min. 20)')
Do any of the above cities stick out to you?
sky.column('height').min()
sky.column('height').max()
sky.hist('height', density = False, bins = np.arange(0, 600, 25),
title = 'Distribution of Skyscraper Heights')
Let's zoom in a little more.
sky.where('height', are.below(300)) \
.hist('height', density = False, bins = np.arange(0, 310, 10),
title = 'Distribution of Skyscraper Heights Below 300m')
sky
Let's say a skyscraper is "short" if its height is less than or equal to 150 meters; otherwise, it's "tall".
def height_cat(height):
if height <= 150:
return 'short'
return 'tall'
sky.apply(height_cat, 'height')
sky = sky.with_columns('height category', sky.apply(height_cat, 'height'))
sky
We can use pivot
to draw a bar chart of the number of short and tall skyscrapers per city.
Fill in the blanks to create the table short_and_tall
, which has two columns, 'short'
and 'tall'
, and one row for each city with at least 5 short and 5 tall skyscrapers. The first five rows of short_and_tall
are shown below.
city | short | tall |
---|---|---|
New York City | 341 | 217 |
Chicago | 268 | 108 |
Miami | 58 | 49 |
Houston | 34 | 27 |
San Francisco | 43 | 22 |
short_and_tall = sky.pivot(__(a)__, __(b)__) \
.where(__(c)__, are.above_or_equal_to(5)) \
.where('tall', are.above_or_equal_to(5)) \
.sort('tall', descending = True)
# short_and_tall = sky.pivot(__(a)__, __(b)__) \
# .where(__(c)__, are.above_or_equal_to(5)) \
# .where('tall', are.above_or_equal_to(5)) \
# .sort('tall', descending = True)
# short_and_tall.barh('city', title = 'Number of Short and Tall Skyscrapers Per City (Min. 5 Each)')
It seems like most cities have roughly twice as many "short" skyscrapers as they do "tall" skyscrapers.
What if we want to look at the distribution of the number of floors per skyscraper, separated by height category?
sky.hist('floors', group = 'height category',
density = False,
bins = np.arange(0, 150, 5),
title = 'Distribution of Number of Floors Per Skyscraper')
Since there is overlap between the two histograms, we have that there are some short skyscrapers (below 150m) with more floors than some tall skyscrapers!
sky
sky.scatter('height', 'floors',
s = 30,
group = 'height category',
title = 'Number of Floors vs. Height',
yaxis_title = 'Number of Floors')
sky.where('height', are.above(300)) \
.scatter('height', 'floors',
s = 50,
labels = 'name',
title = 'Number of Floors vs. Height (Min. 300m)')
sky
sky.group('year')
This is obviously an error in our data.
sky.where('year', 0)
sky.where('year', are.not_equal_to(0)) \
.group('year') \
.plot('year', title = 'Number of Skyscrapers Built Per Year')
What if we want to look at the number of skyscrapers per year built in different cities?
sky.where('city', are.contained_in(['New York City', 'Chicago'])) \
.where('year', are.not_equal_to(0)) \
.pivot('city', 'year')
sky.where('city', are.contained_in(['New York City', 'Chicago'])) \
.where('year', are.not_equal_to(0)) \
.pivot('city', 'year') \
.plot('year',
title = 'Number of Skyscrapers Built Per Year in NYC and Chicago')
sky
Circle.map_table(sky.select('latitude', 'longitude'),
line_color = None,
fill_opacity = 0.65,
area = 75,
color = 'orange')
Let's look at a map of tall skyscrapers in New York City.
ny_tall = sky.where('city', 'New York City') \
.where('height category', 'tall') \
.select('latitude', 'longitude', 'name', 'height') \
.relabeled(['name', 'height'], ['labels', 'color_scale'])
ny_tall
Circle.map_table(ny_tall,
line_color = None,
fill_opacity = 0.65,
area = 150,
color_scale = None)
It seems like most skyscrapers in NYC are either in the financial district or in Midtown. The circles for One World Trade Center and the Empire State Building are bright.
Lastly, what if we want to look at where short and tall skyscrapers are throughout the country?
sky
There are two solutions here.
'short'
or 'tall'
and returns the desired color. (We did this in Lecture 28.)'short'
and 'tall'
and the other with the desired colors, and join this table with sky
.We will use the second approach here.
sky_to_color = Table().with_columns(
'category', np.array(['short', 'tall']),
'colors', np.array(['orange', 'green'])
)
sky_to_color
sky_with_colors = sky.join('height category', sky_to_color, 'category') \
.select('latitude', 'longitude', 'colors')
sky_with_colors
Circle.map_table(sky_with_colors,
line_color = None,
fill_opacity = 0.7)
While there seem to be short skyscrapers (orange) throughout the country, tall skyscrapers generally seem to be concentrated in larger cities.