from datascience import *
import numpy as np
Table.interactive_plots()
import plotly.express as px
sky = Table.read_table('data/skyscrapers.csv') \
.where('status.current', are.contained_in(['completed', 'under construction'])) \
.select('name', 'location.city', 'location.latitude', 'location.longitude',
'statistics.floors above', 'statistics.height', 'status.completed.year') \
.relabeled(['location.city', 'location.latitude', 'location.longitude',
'statistics.floors above', 'statistics.height', 'status.completed.year'],
['city', 'latitude', 'longitude', 'floors', 'height', 'year']) \
.where('height', are.above(0)) \
.where('floors', are.above(0))
sky
sky.group('city') \
.where('count', are.above_or_equal_to(40)) \
.sort('count', descending = True) \
.barh('city', title = 'Number of Skyscrapers Per City')
# Remember, you're not responsible for the code here.
px.pie(sky.group('city').where('count', are.above_or_equal_to(40)).to_df(),
values = 'count',
names = 'city',
title = 'Number of Skyscrapers Per City (Top 10 Only)'
)
sky.shuffle()
sky.group('city') \
.where('count', are.above_or_equal_to(20)) \
.sort('count', descending = True)
sky.group('city') \
.where('count', are.above_or_equal_to(20)) \
.sort('count', descending = True) \
.barh('city', title = 'Number of Skyscrapers Per City (Min. 20)')
Do any of the above cities stick out to you?
sky.column('height').min()
sky.column('height').max()
sky.hist('height', density = False, bins = np.arange(0, 600, 25),
title = 'Distribution of Skyscraper Heights')
Let's zoom in a little more.
sky.where('height', are.below(300)) \
.hist('height', density = False, bins = np.arange(0, 310, 10),
title = 'Distribution of Skyscraper Heights Below 300m')
sky
Let's say a skyscraper is "short" if its height is less than or equal to 150 meters; otherwise, it's "tall".
def height_cat(height):
if height <= 150:
return 'short'
return 'tall'
sky.apply(height_cat, 'height')
sky = sky.with_columns('height category', sky.apply(height_cat, 'height'))
sky
We can use pivot to draw a bar chart of the number of short and tall skyscrapers per city.
Fill in the blanks to create the table short_and_tall, which has two columns, 'short' and 'tall', and one row for each city with at least 5 short and 5 tall skyscrapers. The first five rows of short_and_tall are shown below.
| city | short | tall |
|---|---|---|
| New York City | 341 | 217 |
| Chicago | 268 | 108 |
| Miami | 58 | 49 |
| Houston | 34 | 27 |
| San Francisco | 43 | 22 |
short_and_tall = sky.pivot(__(a)__, __(b)__) \
.where(__(c)__, are.above_or_equal_to(5)) \
.where('tall', are.above_or_equal_to(5)) \
.sort('tall', descending = True)
# short_and_tall = sky.pivot(__(a)__, __(b)__) \
# .where(__(c)__, are.above_or_equal_to(5)) \
# .where('tall', are.above_or_equal_to(5)) \
# .sort('tall', descending = True)
# short_and_tall.barh('city', title = 'Number of Short and Tall Skyscrapers Per City (Min. 5 Each)')
It seems like most cities have roughly twice as many "short" skyscrapers as they do "tall" skyscrapers.
What if we want to look at the distribution of the number of floors per skyscraper, separated by height category?
sky.hist('floors', group = 'height category',
density = False,
bins = np.arange(0, 150, 5),
title = 'Distribution of Number of Floors Per Skyscraper')
Since there is overlap between the two histograms, we have that there are some short skyscrapers (below 150m) with more floors than some tall skyscrapers!
sky
sky.scatter('height', 'floors',
s = 30,
group = 'height category',
title = 'Number of Floors vs. Height',
yaxis_title = 'Number of Floors')
sky.where('height', are.above(300)) \
.scatter('height', 'floors',
s = 50,
labels = 'name',
title = 'Number of Floors vs. Height (Min. 300m)')
sky
sky.group('year')
This is obviously an error in our data.
sky.where('year', 0)
sky.where('year', are.not_equal_to(0)) \
.group('year') \
.plot('year', title = 'Number of Skyscrapers Built Per Year')
What if we want to look at the number of skyscrapers per year built in different cities?
sky.where('city', are.contained_in(['New York City', 'Chicago'])) \
.where('year', are.not_equal_to(0)) \
.pivot('city', 'year')
sky.where('city', are.contained_in(['New York City', 'Chicago'])) \
.where('year', are.not_equal_to(0)) \
.pivot('city', 'year') \
.plot('year',
title = 'Number of Skyscrapers Built Per Year in NYC and Chicago')
sky
Circle.map_table(sky.select('latitude', 'longitude'),
line_color = None,
fill_opacity = 0.65,
area = 75,
color = 'orange')