
library(dplyr)
library(ggplot2)
library(trelliscopejs)
library(tidyr)
library(purrr)
library(gapminder)

# Create the plot, set the scales, and turn on automatic cognostics
ggplot(gapminder, aes(year, lifeExp)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
facet_trelliscope(~ country + continent,
name = "lifeExp_by_country",
desc = "Life expectancy vs. year for 142 countries.",
nrow = 1, ncol = 2,
scales = 'sliced',
auto_cog = T)

library(gapminder)
space_to_dash <- function(x) gsub(" ", "-", x)

# Group by country and create the two new variables


gap <- gapminder %>%
group_by(country) %>%
mutate( delta_lifeExp = tail(lifeExp, 1) - head(lifeExp, 1),
ihme_link = paste0("http://www.healthdata.org/",
space_to_dash(country)))

# Add the description


gap$delta_lifeExp <- cog(gap$delta_lifeExp, desc = "Overall change in life expectancy")
# Specify the default label
gap$ihme_link <- cog(gap$ihme_link, default_label = T)

ggplot(gap, aes(year, lifeExp)) +


geom_point() +
facet_trelliscope(~ country + continent,
name = "lifeExp_by_country",
desc = "Life expectancy vs. year.",
nrow = 1, ncol = 2,
scales = c("same", "sliced"))

# Nest stocks by symbol


by_symbol <- stocks %>% group_by(symbol) %>% tidyr::nest()
min_volume_fn <- function(x) min(x$volume)

# Create new column


by_symbol_min <- by_symbol %>%
mutate(min_volume = map_dbl(data, min_volume_fn))

library(plotly)

ohlc_plot <- function(d) {


plot_ly(d, x = ~date, type = "ohlc",
open = ~open, close = ~close, high = ~high, low = ~low)
}

by_symbol_plot <- by_symbol %>%
  mutate(panel = map_plot(data, ohlc_plot))

# Equivalent without the pipe:
by_symbol_plot <- mutate(by_symbol,
  panel = map_plot(data, ohlc_plot))

Now that we have a data frame nested by stock symbol, let's make a Trelliscope
display with a plot for each stock. In this exercise we use just the first 10 stocks in
the by_symbol dataset created in the previous exercise and build an
"open-high-low-close" (OHLC) plot, which is similar to a candlestick plot. The
by_symbol dataset available in your session has already been subsetted to the first 10
stocks.
Question: Examine the by_symbol_plot output you just created to see what the plot
variables look like. The panel column contains a plotly object.
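
A quick way to check (a minimal sketch; the exact printout depends on your session):

by_symbol_plot                    # panel appears as a list-column
class(by_symbol_plot$panel[[1]])  # each element is a plotly htmlwidget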

trelliscope(by_symbol_plot, name = "ohlc_top500")

# Create market_cap_log
by_symbol <- mutate(by_symbol,
market_cap_log = cog(
val = log10(market_cap), desc = "log base 10 market capitalization"
)
)

annual_return <- function(x) {
  100 * (tail(x$close, 1) - head(x$open, 1)) / head(x$open, 1)
}

# Compute by_symbol_avg
by_symbol_avg <- mutate(by_symbol,
stats = map(data, function(x) {
data_frame(
mean_close = mean(x$close),
mean_volume = mean(x$volume),
annual_return = annual_return(x)
)
}))
## note: use map() here, not map_dbl(), because each stats element is a one-row data frame

library(trelliscopejs)

# Create the trelliscope display


p <- trelliscope(by_symbol, name = 'ohlc_top500', desc = 'ohoohoh', width = 600,
height = 300)

pokemon %>%
  # Reduce the variables in the dataset
  select(pokemon, type_1, attack, generation_id, url_image) %>%
  # Respecify pokemon as a cognostic and create the panel variable
  mutate(pokemon = cog(pokemon, default_label = T),
         panel = img_panel(url_image)) %>%
  # Note: trelliscope(pokemon, ...) errors here because the data is already piped in
  trelliscope(name = "pokemon", nrow = 3, ncol = 6)

# Compute daily_may
daily_may <- bike %>%
filter(start_mon == 5 ) %>%
group_by(start_day, start_hod, membership) %>%
summarise(n = n())
# Plot the result
ggplot(daily_may, aes(start_hod, n, color = membership)) +
geom_point() +
facet_wrap( ~ start_day, ncol = 7)

ggplot(daily_may, aes(start_hod, n, color = membership)) +


geom_point() +
# Facet on start_day
facet_trelliscope(~start_day, nrow = 3, ncol = 7)

# Function to construct a Google maps URL with cycling directions


make_gmap_url <- function(start_lat, start_lon, end_lat, end_lon) {
paste0("https://www.google.com/maps/dir/?api=1",
"&origin=", start_lat, ",", start_lon,
"&destination=", end_lat, ",", end_lon,
"&travelmode=bicycling")
}

# Compute tot_rides, weekday_diff, and map_url


route_hod_updated <- route_hod %>%
  group_by(start_station_code, end_station_code) %>%
  mutate(
    tot_rides = sum(n),
    weekday_diff = mean(n[weekday == "workweek"]) - mean(n[weekday == "weekend"]),
    map_url = make_gmap_url(start_lat, start_lon, end_lat, end_lon))

Visualizing the Data: Counts by Hour-of-Day

Create a Trelliscope display using ggplot2: number of rides vs. hour of day, colored by
workweek/weekend, and faceted by route using facet_trelliscope().
Plot start_hod on the x-axis and n on the y-axis, and color by weekday. Specify size = 3
in your geom. Facet by start and end station name, and set the initial layout to 2 rows
and 4 columns. Remove the legend by adding theme(legend.position = "none") to your
ggplot2 specification.

# Create the plot


ggplot(route_hod, aes(start_hod, n, col = weekday)) +
geom_point(size = 3) +
facet_trelliscope(~ start_station_name + end_station_name, nrow = 2, ncol = 4) +
theme(legend.position = 'none')

### rbokeh
library(rbokeh)

# Filter gapminder data by year 1982
dat_1982 <- gapminder %>% filter(year == 1982)

# Plot life expectancy vs. GDP per capita using dat_1982
figure(legend_location = "bottom_right",
       title = "Life Expectancy vs. GDP per Capita in 1982") %>%
  ly_points(x = gdpPercap, y = lifeExp, data = dat_1982, color = continent,
            hover = c(continent, country, pop))

# Plot life expectancy vs. GDP per capita using data_africa
figure(legend_location = "bottom_right",
       title = "Life Expectancy vs. GDP per Capita in Africa - 1967") %>%
  ly_points(x = gdpPercap, y = lifeExp, data = data_africa,
            hover = c(country, pop))

# Plot lifeExp Vs. gdpPercap


plot_1992 <- figure(legend_location = "bottom_right") %>%
ly_points(x = gdpPercap, y = lifeExp, col = continent, data = dat_1992)

# Extract the records of India and China from gapminder


data_countries <- filter(gapminder, country %in% c('China', 'India') )
fig_countries <- figure() %>%
ly_lines(x = year, y = lifeExp, col = country, data = data_countries)
# Modify: add a points layer with lifeExp vs. year
fig_countries %>%
ly_points(x = year, y = lifeExp, col = country, data = data_countries)

## use a custom palette with colors "green", "red"


fig_col %>%
set_palette(discrete_color = pal_color(c("green", "red")))

## define custom palette


custom_pal <- pal_color(c("#c51b8a", "#31a354"))

## use custom_pal to modify fig_col


fig_col %>% set_palette(custom_pal)

## explore bechdel dataset using str


str(bechdel)

## extract entries between 1980 - 2013

dat_80_13 <- bechdel %>%
  filter(between(year, 1980, 2013)) %>%
  mutate(roi_total = intgross_2013 / budget_2013)

## plot
figure(dat_80_13) %>%
ly_points(x = log(budget_2013), y = log(roi_total))

## filter the data to Syrian Arab Republic and Morocco
hdi_countries <- hdi_data %>%
  filter(country %in% c('Syrian Arab Republic', 'Morocco'))

## change the color and line width
figure(title = "Human Development Index over Time",
       legend_location = "bottom_right") %>%
  ly_lines(x = year, y = human_development_index, color = country, width = 3,
           data = hdi_countries)

# explore the hdi_cpi_2015 dataset
str(hdi_cpi_2015)

## add multiple values as hover info (country, cpi_rank)


figure(legend_location = "bottom_right") %>%
ly_points( x = corruption_perception_index, y = human_development_index,
color = continent, size = 6, hover = c(country, cpi_rank), data =
hdi_cpi_2015 )

## modify the figure theme


figure(title = "Corruption Perception Index Vs. Human Development Index 2015",
legend_location = "bottom_right",
xgrid = F, ygrid = F, xlab = "CPI", ylab = "HDI",
theme = bk_ggplot_theme() ) %>%
ly_points(x = corruption_perception_index, y = human_development_index, data =
hdi_cpi_2015,
color = continent, size = 6, hover = c(country, cpi_rank))
## plot corruption_perception_index versus human_development_index
figure(legend_location = "top_left") %>%
ly_points(x = human_development_index, y = corruption_perception_index, color =
continent, alpha = .7, data = hdi_cpi_data_wide,
hover = c(country, cpi_rank, corruption_perception_index,
human_development_index))

## convert from wide to long


hdi_data_long <- hdi_data_wide %>%
gather(key = year, value = human_development_index , -country)

## display the first 5 rows of hdi_data_long


head(hdi_data_long, 5)

## create a scatter plot with log(budget_2013) Vs log(intgross_2013)


p_scatter <- figure() %>% ly_points( x = log(budget_2013), y = log(intgross_2013),
size = 4, alpha = .5, data = dat_90_13_wide)
## fit a linear reg model
lin_reg <- lm(log(intgross_2013) ~ log(budget_2013), data = dat_90_13)
## add the linear regression line layer to p_scatter
p_scatter %>% ly_abline(lin_reg)

## extract entries for year 2007


dat_2007 <- gapminder %>%
filter(year == 2007)

## create scatter plot


figure(toolbar_location = 'above', legend_location = 'bottom_right' ) %>%
ly_points(x = gdpPercap, y = lifeExp, color = continent, size = 6, alpha = .7,
hover = c(country, lifeExp, gdpPercap), data = dat_2007 )

figure(legend_location = "bottom_right", tools = c("resize", "save") ) %>%


ly_points(x = gdpPercap, y = lifeExp, data = dat_2002, color = continent)

figure(legend_location = "bottom_right", toolbar_location = NULL ) %>%


ly_points(x = gdpPercap, y = lifeExp, data = dat_2002, color = continent)

# Create a bar plot for age group tb_2534 with % on the y-axis
bar_2534_percent <- figure(ylab = "share") %>%
ly_bar(x = year, y = count, tb_2534, color = gender, hover = T, position =
"fill")
# View figure
bar_2534_percent

# Create a list with bar_2534 and bar_2534_percent figures


fig_list <- list(bar_2534 = bar_2534, bar_2534_percent = bar_2534_percent)

# Create a grid plot


grid_plot(fig_list, w = 1000, h = 500)

## create a grid plot with same axes limits


grid_plot(figs = fig_list, w = 1000, h = 400, same_axes = T)
grid_plot(fig_list, ncol = 2, same_axes = T, simplify_axes = F)

## create two dataframes for female/male data


tb_female <- tb %>% filter(gender == 'f')
tb_male <- tb %>% filter(gender == 'm')

## create two plots using plot_line


fig_female <- plot_line(tb_female)
fig_male <- plot_line(tb_male)

## create figure list


fig_list <- list(female = fig_female, male = fig_male)

## plot the two figures in a grid


grid_plot(fig_list, w = 1000, h = 600, same_axes = T)

plot_line <- function(x) {
  figure() %>% ly_lines(x = year, y = count, data = x, color = age, alpha = 1, width = 2)
}

## split tb data by gender


tb_split_gender <- split(tb, tb$gender)
## create a list of figures using lapply
fig_list <- lapply(tb_split_gender, plot_line)
## create a grid plot
grid_plot(fig_list, w = 1000, h = 600, same_axes = T)

## define a function to create a bar plot with the number of tb cases over time
plot_bar <- function(x){
figure() %>% ly_bar( x = year, y = count, color = gender, position = 'dodge',
data = x, hover = T)
}

## apply the function to the groups in tb_split_age


fig_list <- lapply(tb_split_age, plot_bar)
## create a grid plot
grid_plot(fig_list, w = 600, h = 900, nrow = 3, same_axes = T) %>%
theme_axis("x", major_label_orientation = 90)

## filter ny_bikedata to get the entries for day "2017-04-25"
ny_bikedata_20170425 <- ny_bikedata %>% filter(trip_date == "2017-04-25")

## add a points layer to ny_map

ny_map %>%
ly_points( x = station_longitude, y = station_latitude,
size = 8, fill_color = start_count, line_alpha = 0,
hover = c(station_name, start_count, end_count), data = ny_bikedata_20170425)

### PLOTLY
library(plotly)
library(forcats)

# Create a histogram of Critic_Score


vgsales %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram()

## add_histogram(nbinsx = 25)
## add_histogram(xbins = list(start = 0, end = 100, size = 10))
# Create a frequency for Genre
genre_table <- vgsales %>%
count(Genre)

# Reorder the bars for Genre by n


genre_table %>%
mutate(Genre = fct_reorder(Genre, n, .desc = T)) %>%
plot_ly(x = ~Genre, y = ~n) %>%
add_bars()

# Create a scatter plot of User_Score against Critic_Score


vgsales %>%
plot_ly(x = ~Critic_Score, y = ~User_Score) %>%
add_markers()

# Filter out the 2016 video games


vg2016 <- vgsales %>%
filter(Year == 2016)

# Create a stacked bar chart of Rating by Genre


vg2016 %>%
count(Genre, Rating) %>%
plot_ly(x = ~Genre, y = ~n, color = ~Rating) %>%
add_bars() %>% layout(barmode = 'stack')

# Filter out the 2016 video games


vg2016 <- vgsales %>%
filter(Year == 2016)

# Create boxplots of Global_Sales by Genre for above data


vg2016 %>%
plot_ly(x = ~Global_Sales, y = ~Genre) %>%
add_boxplot()

# Create a histogram of Critic_Score with navy bars that are 50% transparent
vgsales2016 %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram(color = I('navy'), opacity = .5)

# Change the color of the histogram using rgb()


vgsales2016 %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram(marker = list(color = "rgb(17, 30, 108)" ))

# Set the plotting symbol to diamond and the size to 4


plot_ly(data = vg2016, x = ~User_Score, y = ~Critic_Score) %>%
add_markers(marker = list(size = 4, symbol = 'diamond'))

# Use color to add Genre as a third variable


vgsales2016 %>%
plot_ly(y = ~User_Score, x = ~Critic_Score,color = ~Genre) %>%
add_markers(colors = "Dark2")

# Create a scatterplot of User_Score against Critic_Score coded by Rating


vgsales2016 %>%
plot_ly(y = ~User_Score, x = ~Critic_Score,symbol = ~Rating) %>%
add_markers(colors = "Dark2")
# Create a scatterplot of User_Score vs. Critic_Score colored by log User_Count
vgsales2016 %>%
plot_ly(x = ~Critic_Score, y = ~User_Score, color = ~log(User_Count)) %>%
add_markers()

# Create a bar chart of Platform with hoverinfo only for the bar heights
vgsales2016 %>%
count(Platform) %>%
plot_ly(x = ~Platform, y = ~n, hoverinfo = 'y') %>%
add_bars()

# Create a scatterplot of User_Score vs. Critic score


vgsales2016 %>%
# Add video game Name to the hover info text
plot_ly(x = ~Critic_Score, y = ~User_Score, hoverinfo = "text", text =
~Name )%>%
add_markers()

# Format the hover info for NA_Sales, EU_Sales, and Name


vgsales2016 %>%
plot_ly(x = ~NA_Sales, y = ~EU_Sales,
hoverinfo = 'text',
text = ~paste("NA_Sales: ", NA_Sales, "<br>",
"EU_Sales: ", EU_Sales, "<br>",
"Name: ", Name)
) %>%
add_markers()

# Polish the scatterplot by transforming the x-axis and labeling both axes
vgsales2016 %>%
plot_ly(x = ~Global_Sales, y = ~Critic_Score) %>%
add_markers(marker = list(opacity = 0.5)) %>%
layout(xaxis = list(title = "Global sales (millions of units)", type = "log"),
yaxis = list(title = "Critic score"))

# Set the background color to #ebebeb and remove the vertical grid
annual_vgsales %>%
plot_ly(x = ~Year, y = ~Global_Sales) %>%
add_lines() %>%
layout( xaxis = list(showgrid = F), paper_bgcolor = "#ebebeb")

# Fit the regression model of User_Score on Critic_Score


m <- lm(User_Score ~ Critic_Score, data = vgsales2016)

# Create the scatterplot with smoother


vgsales2016 %>%
select(User_Score, Critic_Score) %>%
na.omit() %>%
plot_ly(x = ~Critic_Score, y = ~User_Score) %>%
add_markers(showlegend = FALSE) %>%
add_lines(y = ~fitted(m) )

# Compute density curves


d.a <- density(activision$Critic_Score, na.rm = TRUE)
d.e <- density(ea$Critic_Score, na.rm = TRUE)
d.n <- density(nintendo$Critic_Score, na.rm = TRUE)

# Overlay density plots


plot_ly() %>%
add_lines(x = ~d.a$x, y = ~d.a$y, name = "Activision", fill = 'tozeroy') %>%
add_lines(x = ~d.e$x, y = ~d.e$y, name = "Electronic Arts", fill = 'tozeroy') %>%
add_lines(x = ~d.n$x, y = ~d.n$y, name = "Nintendo", fill = 'tozeroy') %>%
layout(xaxis = list(title = 'Critic Score'),
yaxis = list(title = 'Density'))

# Create a faceted scatterplot of User_Score vs. Critic_Score with 3 rows


vgsales2016 %>%
group_by(Platform) %>%
do( plot = plot_ly(data = ., x = ~Critic_Score, y = ~User_Score ) %>%
add_markers(name = ~Platform)
) %>%
subplot(nrows = 3, shareY = TRUE, shareX = TRUE)

# Add x-axis and y-axis labels, and a title


subplot(p1, p2, nrows = 2, shareX = T, shareY = T) %>%
layout(title = "User score vs. critic score by platform, 2016")

# Add x-axis and y-axis labels, and a title to sp2


sp2 %>%
layout(
xaxis = list(title = ""),
xaxis2 = list(title = "Year"),
yaxis = list(title = "Global Sales (M units)"),
yaxis2 = list(title = "Global Sales (M units)")
)

# Create a SPLOM of NA_Sales, EU_Sales, and JP_Sales


vgsales2016 %>%
plot_ly() %>%
add_trace(
type = 'splom',
dimensions = list(
list(label = "N. America", values = ~NA_Sales),
list(label = "Europe", values = ~EU_Sales),
list(label = "Japan", values = ~JP_Sales)
)
)

# Color the SPLOM of NA_Sales, EU_Sales, and JP_Sales by nintendo


vgsales2016 %>%
mutate(nintendo = ifelse(Publisher == "Nintendo", "Nintendo", "Other")) %>%
plot_ly(color = ~nintendo) %>%
add_trace(
type = 'splom',
dimensions = list(
list(label = "N. America", values = ~NA_Sales),
list(label = "Europe", values = ~EU_Sales),
list(label = "Japan", values = ~JP_Sales)
)
)
splom %>% style(diagonal = list(visible = F))
# Delete the plots in the upper half of splom
splom %>% style(showupperhalf = F)
# Delete the plots in the lower half of splom
splom %>% style(showlowerhalf = F)

# Create a binned scatterplot of User_Score vs. Critic_Score


vgsales %>%
plot_ly(x = ~Critic_Score, y = ~User_Score) %>%
add_histogram2d(nbinsx = 50, nbinsy = 50)

vgsales %>%
  plot_ly(x = ~Critic_Score, y = ~User_Score) %>%
  add_histogram2dcontour()

###COMPARE
turnout %>%
plot_ly() %>%
add_markers(x = ~turnout2014, y = ~turnout2018 ) %>%
layout(xaxis = list(title = '2014 voter turnout'),
yaxis = list(title = '2018 voter turnout'))

# Create a scatterplot of turnout2018 against turnout2014


turnout %>%
plot_ly(x = ~turnout2014, y = ~turnout2018) %>%
add_markers() %>%
layout(xaxis = list(title = "2014 voter turnout"),
yaxis = list(title = "2018 voter turnout"))

# Add the line y = x to the scatterplot


p <- turnout %>%
plot_ly(x = ~turnout2014, y = ~turnout2018) %>%
add_markers() %>%
layout(xaxis = list(title = "2014 voter turnout"),
yaxis = list(title = "2018 voter turnout"))
p %>%
add_lines(x = c(.25, .6), y = c(.25, .6)) %>%
layout(showlegend = FALSE)

# Create a dotplot of voter turnout in 2018 by state ordered by turnout


turnout %>%
top_n(15, wt = turnout2018) %>%
plot_ly(x = ~turnout2018, y = ~fct_reorder(state, turnout2018)) %>%
add_markers() %>%
layout(xaxis = list(title = 'Eligible voter turnout'),
yaxis = list(title = 'State', type = 'category'))

# Create a histogram of receipts for the senate races


fundraising %>%
  filter(office == 'S') %>%
  plot_ly(x = ~receipts) %>%
  add_histogram() %>%
  layout(xaxis = list(title = 'Total contributions received'),
         title = 'Fundraising for 2018 Senate races')

# Create a dotplot of the top 15 Senate campaigns


fundraising %>%
filter(office == "S") %>%
top_n(15, wt = receipts) %>%
plot_ly(x = ~receipts, y = ~fct_reorder(state, receipts),
color = ~fct_drop(party),
hoverinfo = "text",
text = ~paste("Candidate:", name, "<br>",
"Party:", party, "<br>",
"Receipts:", receipts, "<br>",
"Disbursements:", disbursement)) %>%
add_markers(colors = c('blue', 'red'))

# Create a choropleth map of the change in voter turnout from 2014 to 2018
turnout %>%
mutate(change = turnout2018 - turnout2014) %>%
plot_geo(locationmode = 'USA-states') %>%
add_trace(z = ~change, locations = ~state.abbr) %>%
layout(geo = list(scope = 'usa'))

# Create a choropleth map displaying the Senate results


senate_winners %>%
plot_geo(locationmode = 'USA-states') %>%
add_trace(z = ~as.numeric(party), locations = ~state,
colors = c('dodgerblue', 'mediumseagreen', 'tomato'),
hoverinfo = "text",
text = ~paste("Candidate:", name, "<br>",
"Party:", party, "<br>",
"% vote:", round(pct.vote, 1))
) %>%
layout(geo = list(scope = 'usa')) %>%
hide_colorbar()

# Map President Trump's rallies in 2018


rallies2018 %>%
plot_geo(locationmode = 'USA-states') %>%
add_markers(
x = ~long, y = ~lat, size = ~no.speakers,
hoverinfo = "text", text = ~paste(city, state, sep = ",")
) %>%
layout(title = '2018 Trump Rallies', geo = list(scope = 'usa'))

# Customize the geo layout


g <- list(scope = 'usa',
showland = T, landcolor = toRGB('gray90'),
showlakes = T, lakecolor = toRGB('white'),
showsubunit = T, subunitcolor = toRGB('white'))

# Apply the geo layout to the map


rallies2018 %>%
plot_geo(locationmode = 'USA-states') %>%
add_markers(
x = ~long, y = ~lat, size = ~no.speakers,
hoverinfo = "text", text = ~paste(city, state, sep = ",")
) %>%
layout(title = "2018 Trump Rallies", geo = g )
# Create a choropleth map displaying the Senate winners
senate_map %>%
group_by(group) %>%
plot_ly(x = ~long, y = ~lat, color = ~party, split = ~region) %>%
add_polygons(line = list(width = .4), showlegend = F)

# Adjust the polygon colors and boundaries


senate_map %>%
group_by(group) %>%
plot_ly(x = ~long, y = ~lat, color = ~party, split = ~region,
colors = c('dodgerblue', 'mediumseagreen', 'tomato')) %>%
add_polygons(line = list(width = 0.4, color = toRGB('gray60')), showlegend =
FALSE)

# Define the layout settings to polish the axes


map_axes <- list(title = '', showgrid = F, zeroline = F, showticklabels = F)

# Apply the layout to both axes


senate_map %>%
group_by(group) %>%
plot_ly(x = ~long, y = ~lat, color = ~party, split = ~region,
colors = c("dodgerblue", "mediumseagreen", "tomato")) %>%
add_polygons(line = list(width = 0.4, color = toRGB("gray60")), showlegend =
FALSE) %>%
layout(xaxis = map_axes, yaxis = map_axes)

# Join the fl_boundaries and fl_results data frames


senate_vote <- left_join(fl_boundaries, fl_results, by = c("subregion" =
"CountyName"))

# Specify the axis settings to polish the map


map_axes <- list(title = '', showgrid = F, zeroline = F, showticklabels = F)

# Create a polished county-level choropleth map of Pctvote


senate_vote %>%
group_by(group) %>%
plot_ly(x = ~long, y = ~lat, color = ~Pctvote, split = ~subregion) %>%
add_polygons(line = list(width = 0.4), showlegend = FALSE, colors = c("blue",
"red")) %>%
layout(xaxis = map_axes, yaxis = map_axes)

###HIGHCHART

# Load the highcharter package


library(highcharter)
# Build a candlestick chart
hchart(xlk_prices, type = "candlestick")
# Build a ohlc chart
hchart(xlk_prices, type = "ohlc")
# Build a line chart
hchart(xlk_prices$close, type = 'line')

## highchart() vs hchart()
# Show the dates
index(xlk_prices)

# Use the base function and set the correct chart type
highchart(type = 'stock') %>%
# Add the price data
hc_add_series(xlk_prices)
# Create a line chart of the 'close' prices
hchart(xlk_prices_tibble, hcaes(x = date, y = close), type = "line")
# Create a line chart of the open prices
hchart(xlk_prices_tibble, hcaes(x = date, y = open), type = 'line')

# Chart the price of KO


highchart(type = 'stock') %>%
hc_add_series(stock_prices_xts$KO)
# GOOG in green
highchart(type = 'stock') %>%
hc_add_series(stock_prices_xts$GOOG, color = 'green')
# Fill in the complete highchart code flow to chart DIS in purple
highchart(type = 'stock') %>%
hc_add_series(stock_prices_xts$DIS, color = 'purple')

highchart(type = "stock") %>%


# Add JPM as a blue line called JP Morgan
hc_add_series(stock_prices_xts$JPM, name = 'JP Morgan', color = 'blue')

highchart(type = "stock") %>%


# Supply the text of the title to hc_title()
hc_title(text = "A history of two stocks") %>%
# Supply the text of the subtitle to hc_subtitle()
hc_subtitle(text = "told with lines") %>%
hc_add_series(stock_prices_xts$AMZN, color = "blue", name = "AMZN") %>%
hc_add_series(stock_prices_xts$DIS, color = "red", name = "DIS") %>%
# Supply the text and format of the y-axis
hc_yAxis(title = list(text = "Prices (USD)"),
labels = list(format = "${value}"),
opposite = F)

highchart(type = "stock") %>%


hc_add_series(stock_prices_xts$AMZN, color = "blue", name = "AMZN") %>%
hc_add_series(stock_prices_xts$DIS, color = "red", name = "DIS") %>%
# Add the dollar sign and y-values on a new line
hc_tooltip(pointFormat = "Daily Price:<br>
$ {point.y}")

highchart(type = "stock") %>%


hc_add_series(stock_prices_xts$AMZN, color = "blue", name = "AMZN") %>%
hc_add_series(stock_prices_xts$DIS, color = "red", name = "DIS") %>%
hc_add_series(stock_prices_xts$GOOG, color = "green", name = "GOOG") %>%
hc_tooltip(pointFormat = "{point.series.name}: ${point.y: .2f}") %>%
hc_legend(enabled = T)

# Visualize DIS as a line chart


hchart(stock_wide_tibble_prices, hcaes(x = date, y = DIS),
type = "line", name = "DIS", color = 'orange')
# Create a line chart of KO
hchart(stock_wide_tibble_prices, hcaes(x = date, y = KO), name = "KO", type =
"line") %>%
# Add JPM DIS AMZN to the chart # Enable a shared tooltip
hc_add_series(stock_wide_tibble_prices, hcaes(x = date, y = JPM),name =
"JPM", type = "line") %>%
hc_add_series(stock_wide_tibble_prices, hcaes(x = date, y = DIS), name =
"DIS", type = "line") %>%
hc_add_series(stock_wide_tibble_prices, hcaes(x = date, y = AMZN), name =
"AMZN", type = "line") %>%
hc_tooltip(shared = TRUE)

# Add JPM to the chart, enable a shared tooltip, and change the y-axis title text
hchart(stock_wide_tibble_prices, hcaes(x = date, y = KO), name = "KO", type = "line") %>%
  hc_add_series(stock_wide_tibble_prices, hcaes(x = date, y = JPM), name = 'JPM', type = "line") %>%
  hc_tooltip(shared = T, pointFormat = "{point.series.name}: ${point.y: .2f}<br>") %>%
  hc_yAxis(title = list(text = "prices (USD)"))

# Specify a green scatter plot


hchart(stock_wide_tibble_returns, hcaes(x = GOOG, y = JPM), type = 'scatter',
color = 'green', name = "GOOG v. JPM") %>%
hc_tooltip(pointFormat = "GOOG: {point.x: .2f}% <br> JPM: {point.y: .2f}%")

hchart(stock_wide_tibble_returns, hcaes(x = KO, y = AMZN), type = "scatter",
       color = 'pink', name = "KO v. AMZN") %>%
  hc_tooltip(pointFormat = "{point.date} <br> AMZN: {point.y: .2f}% <br> KO: {point.x: .2f}%")

# Create a scatter plot


hchart(stock_wide_tibble_returns, hcaes(x = KO, y = GOOG), type = "scatter") %>%
  # Add the slope variable
  hc_add_series(stock_wide_tibble_returns, hcaes(x = KO, y = (KO * 1.15)), type = "line") %>%
  # Customize the tooltip to show the date, x-, and y-values
  hc_tooltip(pointFormat = "{point.date} <br> GOOG: {point.y: .2f}% <br> KO: {point.x: .2f}%")

hchart(stock_wide_tibble_returns, hcaes(x = AMZN, y = DIS), type = "scatter") %>%


hc_add_series(stock_wide_tibble_returns, hcaes(x = AMZN, y = (AMZN * .492)),
type = "line",
tooltip = list(
headerFormat = "DIS/AMZN linear relationship<br>",
pointFormat = "{point.y: .2f}%")) %>%
# Customize the scatter tooltip
hc_tooltip(pointFormat = "{point.date} <br> DIS: {point.y}% <br> AMZN:
{point.x}%")

# Start the hchart flow for the returns data


hchart(commodities_returns, type = "scatter", hcaes(x = gold, y = palladium, date
= date), color = "pink") %>%
hc_tooltip(pointFormat = "date: {point.date} <br>
palladium: {point.y:.4f} <br>
gold: {point.x:.4f} ") %>%
hc_title(text = "Palladium Versus Gold 2017")

stock_tidy_tibble_prices %>%
hchart(., hcaes(x = date, y = price, group = symbol), type = "line") %>%
hc_title(text = "Daily Prices from Tidy Tibble") %>%
hc_yAxis(title = list(text = "Prices (USD)"),
labels = list(format = "${value}"),
opposite = FALSE)

## grouped_df

stock_tidy_tibble_returns %>%
# Calculate the standard deviation and mean of returns
summarize(std_dev = sd(returns),
mean = mean(returns)) %>%
hchart(., hcaes(x = symbol, y = std_dev, group = symbol, size = mean), type =
"scatter") %>%
hc_title(text = "Standard Dev and Mean Return")

stock_tidy_tibble_returns %>%
summarize(avg_returns = mean(returns),
risk = sd(returns),
risk_return = risk/avg_returns) %>%
# Pass the summary statistics to hchart # Color by symbol
hchart(., hcaes(x = symbol, y = risk_return, group = symbol), type =
'column') %>%
hc_title(text = "Risk/Return") %>%
hc_subtitle(text = "lower bars are better")

stock_tidy_tibble_prices %>%
mutate(sector = case_when(symbol == "AMZN" ~ "tech",
symbol == "GOOG" ~ "tech",
symbol == "DIS" ~ "fun",
symbol == "JPM" ~ "bank",
symbol == "KO" ~ "food")) %>%
hchart(., hcaes(x = date, y = price, group = symbol), type = "line") %>%
hc_tooltip(pointFormat = "{point.symbol}: ${point.price: .2f}<br> sector:
{point.sector}")

# Calculate the mean, sd, max and min returns


stock_tidy_tibble_returns %>%
summarize(mean = mean(returns),
st_dev = sd(returns),
max_return = max(returns),
min_return = min(returns) ) %>%
  # Pass the summarized data to hchart()
  hchart(., hcaes(x = symbol, y = st_dev, group = symbol), type = "column") %>%
  # Customize the tooltip to show mean, max and minimum daily returns
  hc_tooltip(pointFormat = "mean: {point.mean: .4f}% <br> max: {point.max_return: .4f}% <br> min: {point.min_return: .4f}% ")

# Pass the tidy tibble to hchart()


hchart(commodities_returns_tidy, hcaes(x = date, y = return, date = date, group = metal),
       type = 'scatter') %>%
  hc_title(text = "Gold, Palladium and Platinum Returns 2017") %>%
  # note: reference the y value as {point.y}, not {point.return}
  hc_tooltip(pointFormat = "date: {point.date} <br> {point.metal}: {point.y:.4f}")

######
### MARKETING
## RESPONSE MODEL

# Extend the sales response model


extended.model <- lm(log(SALES) ~ PRICE + Price.lag + DISPLAY + Display.lag +
COUPON + Coupon.lag + DISPLAYCOUPON + DisplayCoupon.lag, data = sales.data)

# Obtain the model predictions


predicted.values <- c(NA,fitted.values(extended.model))

# Plot log(SALES) against the running index


plot(log(SALES) ~ 1, data = sales.data)

# Add the model predictions to the plot


lines(predicted.values ~ 1)

# Extend the sales response model


extended.model <- lm(log(SALES) ~ PRICE + Price.lag + DISPLAY + Display.lag +
COUPON + Coupon.lag + DISPLAYCOUPON + DisplayCoupon.lag, data = sales.data)
# Obtain the AIC # Update the AIC by single term deletion
AIC(extended.model)
AIC(update(extended.model, . ~ . -Coupon.lag))

# Load the MASS package


library(MASS)
# Backward elimination
final.model <- stepAIC(extended.model, direction = 'backward', trace = FALSE)
summary(final.model)

##
colMeans(choice.data[c('HOPPINESS','BUD','PRICE.HOP','PRICE.BUD')])
price.ratio <- log(choice.data$PRICE.HOP/choice.data$PRICE.BUD)
head(cbind(price.ratio, choice.data$PRICE.BUD, choice.data$PRICE.HOP))

# Explain HOPPINESS by price.ratio


probability.model <- lm(HOPPINESS ~ price.ratio, data = choice.data)

# Plot HOPPINESS against price.ratio # Add the model predictions


plot(HOPPINESS ~ price.ratio, data = choice.data)
abline(probability.model)
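
The logistic.model object used in the next chunks is never created in these notes; its call is echoed in the margins() output further down, so a matching fit would be:

# Fit the logistic response model (call taken from the margins() output below)
logistic.model <- glm(HOPPINESS ~ price.ratio, family = binomial, data = choice.data)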

# Plot HOPPINESS against price.ratio


plot(HOPPINESS ~ price.ratio, data = choice.data)

# Add the logistic response function (curve() supplies x, which is mapped to price.ratio)
curve(predict(logistic.model, data.frame(price.ratio = x), type = "response"),
      add = TRUE)

library(margins)
coef(probability.model)
margins(logistic.model)

> library(margins)
> coef(probability.model)
(Intercept) price.ratio
0.09700236 -0.29594939
> margins(logistic.model)
Average marginal effects
glm(formula = HOPPINESS ~ price.ratio, family = binomial, data = choice.data)
price.ratio
-0.4585

## The average marginal effect of price.ratio in the logistic model is somewhat larger
in magnitude than the price ratio coefficient of the linear probability model. On
average, the purchase probability for Hoppiness increases by around 46 percentage
points if the price ratio decreases by one unit, which is more than the roughly 30
points implied by the linear probability model.
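
As a rough check, the average marginal effect reported by margins() can be reproduced by hand: for a logit model it is the average of the logistic density at the linear predictor times the coefficient (a sketch, assuming the logistic.model fit shown above):

beta <- coef(logistic.model)["price.ratio"]
eta <- predict(logistic.model, type = "link")  # linear predictor per observation
mean(dlogis(eta)) * beta                       # should be close to -0.4585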

# Define the sequence of x values


x <- seq(from = -2, to = 2, by = .5)
# Plot the price.ratio effect
cplot(logistic.model, 'price.ratio', xvals = x)

# Explain HOPPINESS by price.ratio


probit.model <- glm(HOPPINESS ~ price.ratio, family = binomial(link = probit), data
= choice.data)

# Obtain the coefficients


coef(probit.model)

margins(logistic.model)
margins(probit.model)

# Backward elimination
final.model <- stepAIC(extended.model, direction = 'backward', trace = TRUE)
summary(final.model)

# Classify the predictions


predicted <- ifelse(fitted(extended.model) >= .5, 1, 0)

# Obtain the number of purchase events


table(predicted)
mean(predicted)

# Obtain the observed purchases


observed <- choice.data$HOPPINESS

# Cross-tabulate observed vs. predicted purchases


table(predicted, observed)/2798
prop.table(table(predicted, observed))

# Load the pROC package


library(pROC)
# Obtain the observed purchases
observed <- choice.data$HOPPINESS

# Create the Roc object


ROC <- roc(predictor = fitted(extended.model), response = observed)
plot(ROC)

# Create the training data, test data


train.data <- subset(choice.data, LASTPURCHASE == 0)
test.data <- subset(choice.data, LASTPURCHASE == 1)

# Fit the logistic response model to train.data


train.model <- glm(HOPPINESS ~ price.ratio + FEAT.HOP + FEATDISPL.HOP, family =
binomial, data = train.data)
margins(train.model)

# Investigate the extended.model


margins(extended.model)

# Predict the purchase probabilities for test.data


probability <- predict(train.model, test.data, type = "response")

# Classify the predictions


predicted <- ifelse(probability >= .5, 1, 0)

# Obtain the observed purchases from test.data


observed <- test.data$HOPPINESS

# Cross-tabulate observed vs. predicted purchases


prop.table(table(predicted, observed))
#############

### SPATIAL R: SF & RASTER

# Load the sf package


# Read in the trees shapefile, neighborhood, parks
library(sf)
trees <- st_read("trees.shp")
neighborhoods <- st_read("neighborhoods.shp")
parks <- st_read("parks.shp")

# Load the raster package


# Read in the tree canopy single-band raster, manhattan multi-band
library(raster)
canopy <- raster("canopy.tif")
manhattan <- brick("manhattan.tif")

# Get the class for the new objects


class(canopy)
class(manhattan)
# Identify how many layers each object has
nlayers(canopy)
nlayers(manhattan)

library(dplyr)
library(sf)
# Read in the trees shapefile
trees <- st_read("trees.shp")
# Use filter() to limit to honey locust trees
honeylocust <- trees %>% filter(species == "honeylocust")
# Count the number of rows
nrow(honeylocust)

# Create a standard, non-spatial data frame with one column


df <- data.frame(a = 1:3)
# Add a list column to your data frame
df$b <- list(1:4, 1:5, 1:10)
# Convert your data frame to a tibble and print on console
as_tibble(df)
# Pull out the third observation from both columns individually
df$a[3]
df$b[3]

# Read in the parks shapefile


parks <- st_read("parks.shp")
# Compute the areas of the parks # Create a quick histogram of the areas using hist
areas <- st_area(parks)
hist(areas, xlim = c(0, 200000), breaks = 1000)
# Filter to parks greater than 30000 square meters and plot just the geometry of big_parks
big_parks <- parks %>% filter(unclass(areas) > 30000)
plot(st_geometry(big_parks))

### Computing geo-information for vector layers can be done with functions like st_area()
and st_length(). The result can be used in additional calculations. Careful: the result
is a units object that requires additional processing, for example with unclass().
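
A small sketch of what that looks like, using the parks object from above:

areas <- st_area(parks)
class(areas)          # "units"
head(unclass(areas))  # plain numeric vector (here square meters)
# or convert explicitly, e.g. units::set_units(areas, km^2)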

# Plot the parks object using all defaults


plot(parks)
# Plot just the acres attribute of the parks data
plot(parks["acres"])
# Create a new object of just the parks geometry and plot it
parks_geo <- st_geometry(parks)
plot(parks_geo)

### These plots are not pretty, but you can't beat plot() for a quick look with a few
keystrokes. And remember you can use plot(st_geometry(geo_object)) to plot just the
geometry of your object.

# Load the raster package


library(raster)
canopy <- raster("canopy.tif")
manhattan <- brick("manhattan.tif")

# Get the extent of the canopy object # Get the CRS of the manhattan object
extent(canopy)
crs(manhattan)
# Determine the number of grid cells in both raster objects
ncell(manhattan)
ncell(canopy)
# Check if the data is in memory
inMemory(canopy)
# Use getValues() to read the values into a vector and hist() to create a histogram of the values
vals <- getValues(canopy)
hist(vals)

plot(canopy)
plot(manhattan) # Plot a single image for each layer
plotRGB(manhattan) # Plot the manhattan raster as an image

# Determine the CRS for the neighborhoods and trees vector objects
st_crs(neighborhoods)
st_crs(trees)

# Assign CRS to trees


crs_1 <- "+proj=longlat +ellps=WGS84 +no_defs"
st_crs(trees) <- crs_1

# Determine the CRS for the canopy and manhattan rasters


crs(canopy)
crs(manhattan)

# Assign CRS to manhattan


crs_2 <- "+proj=utm +zone=18 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"
crs(manhattan) <- crs_2

# Get the CRS from the canopy object


the_crs <- crs(canopy, asText = TRUE)

# Project trees to match the CRS of canopy


trees_crs <- st_transform(trees, crs = the_crs)

# Project neighborhoods to match the CRS of canopy


neighborhoods_crs <- st_transform(neighborhoods, crs = the_crs)

# Project manhattan to match the CRS of canopy


manhattan_crs <- projectRaster(manhattan, crs = the_crs, method = "ngb")

# Look at the CRS to see if they match


st_crs(trees_crs)
st_crs(neighborhoods_crs)
crs(manhattan_crs)

# Plot canopy and neighborhoods (run both lines together)


# Do you see the neighborhoods?
plot(canopy)
plot(neighborhoods, add = TRUE)

# See if canopy and neighborhoods share a CRS


st_crs(neighborhoods)
st_crs(canopy)
# Save the CRS of the canopy layer
the_crs <- crs(canopy, asText = T)

# Transform the neighborhoods CRS to match canopy


neighborhoods_crs <- st_transform(neighborhoods, crs = the_crs)

# Re-run plotting code (run both lines together)


# Do the neighborhoods show up now?
plot(canopy)
plot(neighborhoods_crs, add = TRUE)

# Simply run the tmap code
library(tmap)

tm_shape(canopy) +
  tm_raster() +
  tm_shape(neighborhoods_crs) +
  tm_polygons(alpha = 0.5)

####
# Create a data frame of counts by species
species_counts <- count(trees, species, sort = T)

# Drop the geometry column


species_no_geometry <- st_set_geometry(species_counts, NULL)
head(species_no_geometry)

# Limit to the fields boro_name, county_fip and boro_code


boro <- select(neighborhoods, boro_name, county_fip, boro_code)
# Drop the geometry column
boro_no_geometry <- st_set_geometry(boro, NULL)
# Limit to distinct records
boro_distinct <- distinct(boro_no_geometry)

# Join the county detail into the trees object


trees_with_county <- inner_join(trees, boro_distinct, by = c("boroname" =
"boro_name"))
head(trees_with_county)

# Plot the neighborhoods geometry


plot(st_geometry(neighborhoods), col = "grey")

# Measure the size of the neighborhoods object


object_size(neighborhoods)

# Compute the number of vertices in the neighborhoods object


pts_neighborhoods <- st_cast(neighborhoods$geometry, "MULTIPOINT")
cnt_neighborhoods <- sapply(pts_neighborhoods, length)
sum(cnt_neighborhoods)

# Simplify the neighborhoods object


neighborhoods_simple <- st_simplify(neighborhoods, preserveTopology = T, dTolerance = 100)
object_size(neighborhoods_simple)

# Compute the number of vertices in the neighborhoods_simple object


pts_neighborhoods_simple <- st_cast(neighborhoods_simple$geometry, "MULTIPOINT")
cnt_neighborhoods_simple <- sapply(pts_neighborhoods_simple, length)
sum(cnt_neighborhoods_simple)
# Plot the neighborhoods_simple object geometry
plot(st_geometry(neighborhoods_simple), col = "grey")

# Read in the trees data


trees <- st_read("trees.shp")

# Convert to Spatial class


trees_sp <- as(trees, Class = 'Spatial')

# Confirm conversion, should be "SpatialPointsDataFrame"


class(trees_sp)

# Convert back to sf
trees_sf <- st_as_sf(trees_sp)

# Confirm conversion
class(trees_sf)

# Read in the CSV


trees <- read.csv("trees.csv")

# Convert the data frame to an sf object


trees_sf <- st_as_sf(trees, coords = c("longitude","latitude"), crs = 4326)

# Plot the geometry of the points


plot(st_geometry(trees_sf))

# Write the file out with coordinates


st_write(trees_sf, "new_trees.csv", layer_options = 'GEOMETRY=AS_XY', delete_dsn =
TRUE)

# Read in the file you just created and check coordinates


new_trees <- read.csv("new_trees.csv")
head(new_trees)

# Read in the canopy layer


canopy <- raster("canopy.tif")

# Plot the canopy raster


plot(canopy)
res(canopy)
ncell(canopy)

# Aggregate the raster


canopy_small <- aggregate(canopy, fact = 10)
plot(canopy_small)

# Reclassify # Set up the matrix


vals <- cbind(100, 300, NA)
canopy_reclass <- reclassify(canopy, rcl = vals)
plot(canopy_reclass)
# Review df
df

# Convert the data frame to an sf object


df_sf <- st_as_sf(df, coords = c("longitude", "latitude"), crs = 4326)

# Transform the points to match the manhattan CRS


df_crs <- st_transform(df_sf, crs = crs(manhattan, asText = TRUE))

# Buffer the points


df_buf <- st_buffer(df_crs, dist = 1000)

# Plot the manhattan image (it is multi-band)


plotRGB(manhattan)
plot(st_geometry(df_buf), col = "firebrick", add = TRUE)
plot(st_geometry(df_crs), pch = 16, add = TRUE)

# Read in the neighborhoods shapefile


neighborhoods <- st_read('neighborhoods.shp')

# Project neighborhoods to match manhattan


neighborhoods_tf <- st_transform(neighborhoods, crs = 32618)

# Compute the neighborhood centroids


centroids <- st_centroid(neighborhoods_tf)

# Plot the neighborhood geometry


plot(st_geometry(neighborhoods_tf), col = "grey", border = "white")
plot(centroids, pch = 16, col = "firebrick", add = T)

# Plot the neighborhoods and beech trees


plot(st_geometry(neighborhoods), col = "grey", border = "white")
plot(beech, add = T, pch = 16, col = "forestgreen")

# Compute the coordinates of the bounding box


st_bbox(beech)

# Create a bounding box polygon


beech_box <- st_make_grid(beech, n = 1)

# Plot the neighborhoods, add the beech trees and add the new box
plot(st_geometry(neighborhoods), col = "grey", border = "white")
plot(beech, add = T, pch = 16, col = "forestgreen")
plot(beech_box, add = T)

# Buffer the beech trees by 3000


beech_buffer <- st_buffer(beech, 3000)

# Limit the object to just geometry


beech_buffers <- st_geometry(beech_buffer)

# Compute the number of features in beech_buffer


length(beech_buffers)
# Plot the tree buffers
plot(beech_buffers)

# Dissolve the buffers


beech_buf_union <- st_union(beech_buffers)

# Compute the number of features in beech_buf_union


length(beech_buf_union)

# Plot the dissolved buffers


plot(beech_buf_union)

# Convert the points to a single multi-point


beech1 <- st_union(beech)

# Compute the tight bounding box


beech_hull <- st_convex_hull(beech1)

# Plot the points together with the hull


plot(beech_hull, col = "red")
plot(beech1, add = T)

## When computing a convex hull, remember to first dissolve/union/combine individual
features into a multi-feature. With head() you should have noticed that you started
with “POINT” objects and after st_union() you had “MULTIPOINT”.
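
A quick sketch of why the union matters, using the beech points from above: st_convex_hull() works feature-by-feature, and the hull of a single point is just that point.

plot(st_convex_hull(beech))            # per-feature hulls: still just points
plot(st_convex_hull(st_union(beech)))  # one hull around the combined multipoint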

Plot the beech trees (beech) on top of the neighborhoods (neighborhoods). You will
want to plot only the geometry of the neighborhoods.
Use class() to see if the beech object has class data.frame or if it's just
geometry.
Convert the sf geometry object beech to an sf data frame with st_sf().
Use class() to confirm that beech now has a class of data.frame (as well as sf).
Use st_join() to conduct a spatial join in order to add neighborhood information to
the beech object.
Use head() to confirm that the new object has neighborhood information -- for
example, it should now have neighborhood name (ntaname).

# Plot the beech on top of the neighborhoods


plot(st_geometry(neighborhoods))
plot(beech, add = T, pch = 16, col = "red")

# Determine whether beech has class data.frame


class(beech)

# Convert the beech geometry to a sf data frame


beech_df <- st_sf(beech) ###

# Confirm that beech now has the data.frame class


class(beech_df)

# Join the beech trees with the neighborhoods


beech_neigh <- st_join(beech_df, neighborhoods)

# Confirm that beech_neigh has the neighborhood information


head(beech_neigh)

# Identify neighborhoods that intersect with the buffer


neighborhoods_int <- st_intersects(buf, neighborhoods)

# Identify neighborhoods contained by the buffer


neighborhoods_cont <- st_contains(buf, neighborhoods)

# Get the indexes of which neighborhoods intersect


# and are contained by the buffer
int <- neighborhoods_int[[1]]
cont <- neighborhoods_cont[[1]]

# Get the names of the neighborhoods in the buffer


neighborhoods$ntaname[int]

# Clip the neighborhood layer by the buffer (ignore the warning)


neighborhoods_clip <- st_intersection(buf, neighborhoods)

# Plot the geometry of the clipped neighborhoods


plot(st_geometry(neighborhoods_clip), col = "red")
plot(neighborhoods[cont,], add = TRUE, col = "yellow")

## A note about the output of functions that test relationships between two sets of
features: the output of these and related functions is a special kind of list (with the
class sgbp). For example, with st_intersects(x, y), the first element of the output,
accessed with [[1]], gives the indexes of the features in y that intersect the first
feature of x; likewise, [[2]] gives the indexes of the features in y that intersect the
second feature of x.
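
A short sketch of working with that sgbp list, using the buf and neighborhoods objects from above:

neighborhoods_int <- st_intersects(buf, neighborhoods)
class(neighborhoods_int)    # "sgbp"
neighborhoods_int[[1]]      # indexes of neighborhoods intersecting the first feature of buf
lengths(neighborhoods_int)  # number of matches for each feature of buf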

# Read in the parks object


parks <- st_read('parks.shp')

# Test whether the CRS match


st_crs(empire_state) == st_crs(parks)

# Project parks to match empire state


parks_es <- st_transform(parks, crs = st_crs(empire_state))

# Compute the distance between empire_state and parks_es


d <- st_distance(empire_state, parks_es)

# Find the index of the nearest park


nearest <- which.min(d)

# Identify the park that is nearest


parks_es[nearest, ]

## Of course, measuring distance between feature sets is a component of spatial
analysis 101 -- a core skill for any analyst. There are several functions in base R as
well as in the packages rgeos and geosphere to compute distances, but the st_distance()
function from sf provides a useful feature-to-feature distance matrix as output and can
be used for most distance calculation needs. In this exercise you'll measure the
distance from the Empire State Building to all the parks and identify the closest one.

## If you look carefully at the result of the last line of code, you'll see that the
closest park is Greeley Square Park; it's just around the corner from the Empire State
Building. In this case one of our feature sets was a single feature. You may end up
applying this function in situations where there are multiple features in both objects;
in that situation sf will return a matrix.
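
For the multi-feature case, a small illustrative sketch (hypothetical subsets of the parks_es object from above):

d_mat <- st_distance(parks_es[1:3, ], parks_es[1:5, ])
dim(d_mat)                            # 3 rows x 5 columns, one cell per pair of features
apply(unclass(d_mat), 1, which.min)   # nearest column feature for each row feature
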
# Project parks to match canopy
parks_cp <- st_transform(parks, crs = crs(canopy, asText = TRUE))

# Compute the area of the parks


areas <- st_area(parks_cp)

# Filter to parks with areas > 30000


parks_big <- filter(parks_cp, unclass(areas) > 30000)

# Plot the canopy raster


plot(canopy)

# Plot the geometry of parks_big


plot(st_geometry(parks_big))

# Convert parks to a Spatial object


parks_sp <- as(parks_big, "Spatial")

# Mask the canopy layer with parks_sp and save as canopy_mask


canopy_mask <- mask(canopy, mask = parks_sp)

# Plot canopy_mask -- this is a raster!


plot(canopy_mask)


# Crop canopy with parks_sp


canopy_crop <- crop(canopy, parks_sp)

# Plot the cropped version and compare


plot(canopy_crop)

# Project the landmarks to match canopy


landmarks_cp <- st_transform(landmarks, crs = crs(canopy, asText = T))

# Convert the landmarks to a Spatial object


landmarks_sp <- as(landmarks_cp, 'Spatial')

# Extract the canopy values at the landmarks


landmarks_ex <- extract(canopy, landmarks_sp)

# Look at the landmarks and extraction results


landmarks_cp
landmarks_ex

# Read in the canopy and impervious layer


canopy <- raster('canopy.tif')
impervious <- raster('impervious.tif')

# Function f with 2 arguments and the raster math code


f <- function(rast1, rast2) {
rast1 < 20 & rast2 > 80
}

# Do the overlay using f as fun


canopy_imperv_overlay <- overlay(canopy, impervious, fun = f)

# Plot the result (low tree canopy and high impervious areas)
plot(canopy_imperv_overlay)

## You've now learned to perform raster math using the raster function overlay(). You
limited the result to areas with < 20% tree canopy and > 80% impervious surface; these
are the most urban areas of the city, including parts of Manhattan and Brooklyn.

# Compute the counts of all trees by hood


tree_counts <- count(trees, hood)

# Take a quick look


head(tree_counts)

# Remove the geometry


tree_counts_no_geom <- st_set_geometry(tree_counts, NULL)

# Rename the n variable to tree_cnt


tree_counts_renamed <- rename(tree_counts_no_geom, tree_cnt = n)

# Create histograms of the total counts


hist(tree_counts_renamed$tree_cnt)

# Compute areas and unclass


areas <- unclass(st_area(neighborhoods))

# Add the areas to the neighborhoods object
neighborhoods_counts <- neighborhoods %>%
  mutate(area = areas) %>%
  left_join(tree_counts_renamed, by = "hood") %>%
  mutate(tree_cnt = ifelse(is.na(tree_cnt), 0, tree_cnt)) %>%
  mutate(tree_density = tree_cnt / area)

# Transform the neighborhoods CRS to match the canopy layer


neighborhoods_crs <- st_transform(neighborhoods, crs = crs(canopy, asText = T))

# Convert neighborhoods object to a Spatial object


neighborhoods_sp <- as(neighborhoods_crs, 'Spatial')

# Compute the mean of canopy values by neighborhood and add them to neighborhoods
canopy_neighborhoods <- extract(canopy, neighborhoods_sp, fun = mean)
neighborhoods_avg_canopy <- mutate(neighborhoods, avg_canopy = canopy_neighborhoods)

## Note that you transformed the neighborhoods object's CRS. This is actually not
strictly necessary because extract() can transform the CRS on the fly, but it will be
needed for plotting and other operations later, so doing it manually is important here.

## GGPLOT
# Load the ggplot2 package
library(ggplot2)

# Create a histogram of tree density (tree_density), average canopy (avg_canopy)


ggplot(neighborhoods, aes(x = tree_density)) +
geom_histogram(color = "white")

ggplot(neighborhoods, aes(x = avg_canopy)) +


geom_histogram(color = "white")

# Create a scatter plot of tree_density vs avg_canopy


ggplot(neighborhoods, aes(x = avg_canopy, y = tree_density)) +
geom_point() +
stat_smooth()

# Compute the correlation between density and canopy


cor(neighborhoods$tree_density, neighborhoods$avg_canopy)

# Plot the tree density, tree canopy with default colors


ggplot(neighborhoods) + geom_sf(aes(fill = tree_density) )
ggplot(neighborhoods) + geom_sf(aes(fill = avg_canopy))

# Plot using scale_fill_gradient()


ggplot(neighborhoods) +
geom_sf(aes(fill = tree_density)) +
scale_fill_gradient(low = "#edf8e9", high = "#005a32")

ggplot(neighborhoods) +
geom_sf(aes(fill = avg_canopy)) +
scale_fill_gradient(low = "#edf8e9", high = "#005a32")

## TMAP
# Create a simple map of neighborhoods
tm_shape(neighborhoods) +
tm_polygons()

# Create a color-coded map of neighborhood tree density


tm_shape(neighborhoods) +
  tm_polygons(col = 'tree_density')  # note: the column name must be quoted

# Style the tree density map


tm_shape(neighborhoods) +
tm_polygons("tree_density", palette = 'Greens',
style = "quantile", n = 7,
title = "Trees per sq. KM")

# Create a similar map of average tree canopy


tm_shape(neighborhoods) +
tm_polygons("avg_canopy", palette = "Greens",
style = 'quantile', n = 7,
title = "Average tree canopy (%)")
# Combine the aerial photo and neighborhoods into one map
map1 <- tm_shape(manhattan) +
tm_raster() +
tm_shape(neighborhoods) +
tm_borders(col = "black", lwd = 0.5, alpha = 0.5)

# Create the second map of tree measures


map2 <- tm_shape(neighborhoods, bbox = bbox(manhattan)) +
tm_polygons(c("tree_density", "avg_canopy"),
style = "quantile",
palette = "Greens",
title = c("Tree Density", "Average Tree Canopy"))

# Combine the two maps into one


tmap_arrange(map1, map2, asp = NA)

## Combine two maps with tmap_arrange(); asp = NA sets the map height/width to match
the bounding box.

### LEAFLET
library(leaflet)
library(stringr)

leaflet() %>% addTiles()

# Print the providers list included in the leaflet library


providers
# Use str_detect() to print only the provider tile names that include the string "CartoDB"
names(providers)[str_detect(names(providers), "CartoDB")]

# Provider: 'CartoDB' 'Esri' 'CartoDB.PositronNoLabels'


leaflet() %>% addProviderTiles(provider = 'CartoDB')

# Map with CartoDB tile centered on DataCamp's NYC office with zoom of 6
leaflet() %>% addProviderTiles("CartoDB") %>%
setView(lng = -73.98575, lat = 40.74856, zoom = 6)

# Map with CartoDB.PositronNoLabels tile centered on DataCamp's Belgium office with zoom of 4
leaflet() %>% addProviderTiles("CartoDB.PositronNoLabels") %>%
  setView(lng = dc_hq$lon[2], lat = dc_hq$lat[2], zoom = 4)

# Set minZoom and dragging, Set default zoom level, set max bounds of map

leaflet(options = leafletOptions(minZoom = 12, dragging = FALSE)) %>%


addProviderTiles("CartoDB") %>%
setView(lng = dc_hq$lon[2], lat = dc_hq$lat[2], zoom = 14) %>%
setMaxBounds(lng1 = dc_hq$lon[2] + .05, lat1 = dc_hq$lat[2] + .05,
lng2 = dc_hq$lon[2] - .05, lat2 = dc_hq$lat[2] - .05)

# Create a dataframe called `ca` with data on only colleges in California


ca <- ipeds %>% filter(state == "CA")

# Use addMarkers() to plot all of the colleges in ca on the leaflet map
map %>% addMarkers(lng = ca$lng, lat = ca$lat)
# Center the map on LA
map %>% addMarkers(data = ca) %>%
setView(lat = la_coords$lat, lng = la_coords$lon, zoom = 12)

map_zoom <- map %>% addMarkers(data = ca) %>%


setView(lat = la_coords$lat, lng = la_coords$lon, zoom = 8)

map2 <- map %>% clearMarkers()

map2 %>% addCircleMarkers(lng = ca$lng, lat = ca$lat, radius = 2, col = 'red')


map_color <- map %>% addCircleMarkers(data = ca, radius = 2, color = "#2cb42c",
popup = ~name)

map2 %>% addCircleMarkers(data = ca, radius = 2, popup = ~paste0("<b>", name,


"</b>", "<br/>", sector_label))
map %>% addCircleMarkers(data = ca, radius = 2, label = ~name)
map %>% addCircleMarkers(data = ca, radius = 2, label = ~paste0(name, " (",
sector_label, ")"))

# Make a color palette called pal for the values of sector_label using colorFactor()
# Colors "red", "blue", and "#9b4a11" for "Public", "Private", and "For-Profit"
pal <- colorFactor(palette = c("red", "blue", "#9b4a11"),
levels = c("Public", "Private", "For-Profit"))

# Add circle markers that color colleges using pal() and the values of sector_label
map2 <- map %>% addCircleMarkers(data = ca, radius = 2,
color = ~pal(sector_label), label = ~paste0(name, " (",
sector_label, ")"))

# Add legend that displays the colors used in pal


m %>% addLegend(pal = pal, values = c("Public", "Private", "For-Profit"))

# legend: opacity of .5, title of Sector, and position of topright


m %>% addLegend(pal = pal, values = c("Public", "Private", "For-Profit"),
opacity = 0.5, title = "Sector", position = "topright")

library(leaflet.extras)

leaflet() %>% addTiles() %>%


addSearchOSM() %>%
addReverseSearchOSM()

m2 <- ipeds %>% leaflet() %>% addProviderTiles('CartoDB') %>%


# center on the middle of the US with zoom of 3
setView(lat = 39.8282, lng = -98.5795, zoom = 3)
# Map all American colleges
m2 %>% addCircleMarkers()

pal <- colorFactor(palette = c("red", "blue", "#9b4a11"),


levels = c("Public", "Private", "For-Profit"))

m2 %>% addCircleMarkers(radius = 2, label = ~name, color = ~pal(sector_label))


# Load the htmltools package
library(htmltools)

# Create data frame called public with only public colleges


public <- filter(ipeds, sector_label == "Public")

# Create a leaflet map of public colleges called m3


m3 <- leaflet() %>%
addProviderTiles("CartoDB") %>%
addCircleMarkers(data = public, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Public")

# Create data frame called private with only private colleges


private <- filter(ipeds, sector_label == "Private")

# Add private colleges to `m3` as a new layer


m3 <- m3 %>%
addCircleMarkers(data = private, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Private") %>%
addLayersControl(overlayGroups = c("Public", "Private"))

# Create data frame called profit with only For-Profit colleges


profit <- filter(ipeds, sector_label == "For-Profit")

# Add For-Profit colleges to `m3` as a new layer


m3 <- m3 %>%
addCircleMarkers(data = profit, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "For-Profit") %>%
addLayersControl(overlayGroups = c("Public", "Private", "For-Profit"))

# Center the map on the middle of the US with a zoom of 4


m4 <- m3 %>%
setView(lat = 39.8282, lng = -98.5795, zoom = 4)

# Create data frame called profit with only For-Profit colleges


profit <- filter(ipeds, sector_label == "For-Profit")

leaflet() %>%
  # Add the OSM, CartoDB and Esri tiles; use addLayersControl to allow users to
  # toggle between basemaps
addTiles(group = "OSM") %>%
addProviderTiles('CartoDB', group = "CartoDB") %>%
addProviderTiles("Esri", group = "Esri") %>%
addLayersControl(baseGroups = c('OSM', 'CartoDB', 'Esri'))

m4 <- leaflet() %>%
  addTiles(group = "OSM") %>%
addProviderTiles("CartoDB", group = "Carto") %>%
addProviderTiles("Esri", group = "Esri") %>%
addCircleMarkers(data = public, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Public") %>%
addCircleMarkers(data = private, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Private") %>%
addCircleMarkers(data = profit, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "For-Profit") %>%
addLayersControl(baseGroups = c("OSM", "Carto", "Esri"), overlayGroups =
c("Public", "Private", "For-Profit")) %>%
setView(lat = 39.8282, lng = -98.5795, zoom = 4)
# Make each sector of colleges searchable
m4_search <- m4 %>%
addSearchFeatures(targetGroups = c('Public', 'Private', 'For-Profit'), options =
searchFeaturesOptions(zoom = 18))

ipeds %>%
leaflet() %>% addTiles() %>%
  # Sanitize any html in our labels
  # Color colleges by sector using the `pal` palette
  # Cluster all colleges using `clusterOptions`
addCircleMarkers(radius = 2, label = ~htmlEscape(name), color =
~pal(sector_label), clusterOptions = markerClusterOptions())

## POLYGONS

class(shp)
slotNames(shp)
# Glimpse the data
glimpse(shp@data)
glimpse(nc_income)

# Summarize the nc_income data


summary(nc_income)

# Left join nc_income onto shp@data and store in shp_nc_income


shp_nc_income <- shp@data %>% left_join(nc_income, by = c("GEOID10" = "zipcode"))

# Print the number of missing values of each variable in shp_nc_income


shp_nc_income %>% summarize_all(funs(sum(is.na(.))))
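Note: funs() has since been deprecated in dplyr. Assuming dplyr 1.0 or later, an equivalent per-column count of missing values can be written with across():

# Same missing-value count per column, using across() instead of the deprecated funs()
shp_nc_income %>% summarize(across(everything(), ~ sum(is.na(.x))))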

# map the polygons in shp


shp %>%
leaflet() %>% addTiles() %>% addPolygons()

# which zips were not in the income data?


shp_na <- shp[is.na(shp$mean_income),]

# map the polygons in shp_na


shp_na %>% leaflet() %>% addTiles() %>% addPolygons()

# summarize the mean income variable


summary(shp$mean_income)

# subset shp to include only zip codes in the top quartile of mean income
high_inc <- shp[!is.na(shp$mean_income) & shp$mean_income > 55917,]

# map the boundaries of the zip codes in the top quartile of mean income
high_inc %>%
leaflet() %>% addTiles() %>% addPolygons()

nc_pal <- colorNumeric("YlGn", domain = high_inc@data$mean_income)

high_inc %>%
leaflet() %>% addTiles() %>%
  # Set boundary thickness to 1 and color polygons
  # Add labels that display mean income
  # Highlight polygons on hover
addPolygons(weight = 1, color = ~nc_pal(mean_income),
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlight = highlightOptions(weight = 5, color = "white",
bringToFront = TRUE))

# Use the log() function to create a new version of nc_pal


nc_pal <- colorNumeric("YlGn", domain = log(high_inc@data$mean_income))

# comment out the map tile


high_inc %>%
leaflet() %>%
# addProviderTiles("CartoDB") %>%
  # Apply the new nc_pal to the log of mean_income
addPolygons(weight = 1, color = ~nc_pal(log(mean_income)), fillOpacity = 1,
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlightOptions = highlightOptions(weight = 5, color = "white",
bringToFront = TRUE))

wealthy_zips %>%
leaflet() %>%
addProviderTiles("CartoDB") %>%
# set color to green and create Wealth Zipcodes group
addPolygons(weight = 1, fillOpacity = .7, color = "green", group = "Wealthy
Zipcodes",
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlightOptions = highlightOptions(weight = 5, color = "white",
bringToFront = TRUE))

# Add polygons using wealthy_zips


final_map <- m4 %>%
addPolygons(data = wealthy_zips, weight = 1, fillOpacity = .5, color = "Grey",
group = "Wealthy Zip Codes",
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlight = highlightOptions(weight = 5, color = "white",
bringToFront = TRUE)) %>%
# Update layer controls including "Wealthy Zip Codes"
addLayersControl(baseGroups = c("OSM", "Carto", "Esri"),
overlayGroups = c("Public", "Private", "For-Profit", "Wealthy Zip Codes"))

################

library(sp)
library(rgdal)

dir()
dir("nynta_16c")
# Read in shapefile with readOGR(): neighborhoods
neighborhoods <- readOGR("nynta_16c","nynta")

# neighborhoods
summary(neighborhoods)
plot(neighborhoods)

library(raster)

dir("nyc_grid_data")
# Use raster() with file path: income_grid
income_grid <- raster("nyc_grid_data/m5602ahhi00.tif")

summary(income_grid)
plot(income_grid)

library(tigris)

# Call tracts(): nyc_tracts (cb = TRUE requests lower-resolution cartographic boundaries)


nyc_tracts <- tracts(state = "NY", county = "New York", cb = T)

summary(nyc_tracts)
plot(nyc_tracts)

library(sp)

# proj4string() on nyc_tracts and neighborhoods


proj4string(nyc_tracts)
proj4string(neighborhoods)

> proj4string(nyc_tracts)
[1] "+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"
> proj4string(neighborhoods)
[1] "+proj=lcc +lat_1=40.66666666666666 +lat_2=41.03333333333333
+lat_0=40.16666666666666 +lon_0=-74 +x_0=300000 +y_0=0 +datum=NAD83 +units=us-ft
+no_defs +ellps=GRS80 +towgs84=0,0,0"

# coordinates() on nyc_tracts and neighborhoods


coordinates(nyc_tracts)
coordinates(neighborhoods)

# plot() neighborhoods and nyc_tracts


plot(neighborhoods)
plot(nyc_tracts, col = 'red', add = T)

## We didn't see the tracts on our plot of neighborhoods because the coordinates of
the tracts put them way off the boundaries of our plot.

library(sp)
library(raster)

# Use spTransform on neighborhoods: neighborhoods


neighborhoods <- spTransform(neighborhoods, CRS(proj4string(nyc_tracts)))

# head() on coordinates() of neighborhoods


head(coordinates(neighborhoods) )

# Plot neighborhoods, nyc_tracts and water


plot(neighborhoods)
plot(nyc_tracts, add = T, col = 'red')
plot(water, add = T, col = 'blue')

## If you plot the untransformed objects with tmap, it actually transforms on the
fly, but it's useful to know how to do it manually.

library(sp)
# Use str() on nyc_income # ...and on nyc_tracts@data
str(nyc_income)
str(nyc_tracts@data)

# Highlight tract 002201 in nyc_tracts


plot(nyc_tracts)
plot(nyc_tracts[nyc_tracts$TRACTCE == "002201", ], col = "red", add = TRUE)

# Set nyc_tracts@data to nyc_income


nyc_tracts@data <- nyc_income

# Highlight tract 002201 again


plot(nyc_tracts)
plot(nyc_tracts[nyc_tracts$tract == "002201", ], col = "red", add = TRUE)

# Check for duplicates in nyc_income


any(duplicated(nyc_income$tract ) )

# Check for duplicates in nyc_tracts


any(duplicated(nyc_tracts$TRACTCE ) )

# Check nyc_tracts in nyc_income


all(nyc_tracts$TRACTCE %in% nyc_income$tract)

# Check nyc_income in nyc_tracts


all(nyc_income$tract %in% nyc_tracts$TRACTCE)

library(sp)
library(tmap)

# Merge nyc_tracts and nyc_income: nyc_tracts_merge


nyc_tracts_merge <- merge(nyc_tracts, nyc_income, by.x = "TRACTCE", by.y = "tract")

# Choropleth with col mapped to the "estimate" column


tm_shape(nyc_tracts_merge) + tm_fill(col = "estimate")

library(tmap)
## Careful with the quotation marks around column names
tm_shape(nyc_tracts_merge) + tm_fill(col = "estimate") +
tm_shape(water) + tm_fill(col = "grey90") +
tm_shape(neighborhoods) + tm_borders()

library(tmap)

# Find unique() nyc_tracts_merge$COUNTYFP


unique(nyc_tracts_merge$COUNTYFP)

# Add logical expression to pull out New York County


manhat_hoods <- neighborhoods[neighborhoods$CountyFIPS == '061', ]
tm_shape(nyc_tracts_merge) +
tm_fill(col = "estimate") +
tm_shape(water) +
tm_fill(col = "grey90") +
tm_shape(manhat_hoods) +
tm_borders() +
tm_text(text = 'NTAName')

library(tmap)

# gsub() to replace " " with "\n"


manhat_hoods$name <- gsub(" ","\n",manhat_hoods$NTAName)
# gsub() to replace "-" with "/\n"
manhat_hoods$name <- gsub("-","/\n",manhat_hoods$name)

# Edit to map text to name, set size to 0.5


tm_shape(nyc_tracts_merge) + tm_fill(col = "estimate") +
tm_shape(water) + tm_fill(col = "grey90") +
tm_shape(manhat_hoods) +
tm_borders() +
tm_text(text = "name", size = .5)

library(tmap)

# Add a title and change the palette in tm_fill()
# tm_borders() draws the tract and neighborhood boundaries
# tm_credits() adds a source note

tm_shape(nyc_tracts_merge) +
tm_fill(col = "estimate", title = 'Median Income', palette = 'Greens') +
tm_borders(col = 'grey60', lwd = .5) +
tm_shape(water) +
tm_fill(col = "grey90") +
tm_shape(manhat_hoods) +
tm_borders(col = 'grey40', lwd = 2 ) +
tm_text(text = "name", size = 0.5) +
tm_credits("Source: ACS 2014 5-year Estimates, \n accessed via acs package",
position = c("right", "bottom"))

save_tmap(filename = "nyc_income_map.png", width = 4, height = 7)

######
ls.str() ## list variables

n <- 300
x <- runif(n, 0, 1)
y <- runif(n, 0, 2)

mapxy <- function(a = NA){
  # xmin, xmax, ymin, ymax (the bounding box) are assumed to be pre-defined in the exercise
  plot(x, y, asp = a)
  rect(xmin, ymin, xmax, ymax)
}

mapxy(1)

# Load the spatstat package


library(spatstat)

Quadrat count test for uniformity


Quadrat count tests are implemented using quadrat.test(), which takes a planar
point pattern, ppp() object. "Planar point pattern" is jargon for a set of points
in a region of a 2D plane.
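As a minimal sketch of that workflow on toy data (assuming spatstat is loaded; the exercise code further below applies the same test to points in a disc):

# Build a planar point pattern on the unit square and test it for uniformity
x_toy <- runif(100)
y_toy <- runif(100)
pp_toy <- ppp(x_toy, y_toy, window = owin(c(0, 1), c(0, 1)))
quadrat.test(pp_toy, nx = 3)   # 3 x 3 quadrats keeps expected counts reasonably large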

Creating a uniform point pattern with spatstat


A Poisson point process creates events according to a Poisson distribution with an
intensity parameter specifying the expected events per unit area. The total number
of events generated is a single number from a Poisson distribution, so multiple
realisations of the same process can easily have different numbers of events.
The spatstat package can generate Poisson spatial processes with the rpoispp()
function, given an intensity and a window; the realisations are not conditioned on the
total number of events.
Just as the random number generator functions in R start with an "r", most of the
random point-pattern functions in spatstat start with an "r". The area() function
of spatstat will compute the area of a window such as a disc.
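A quick check of that last point (a small sketch reusing the disc10/lambda setup from the exercise code below): the total count differs between realisations of the same process.

# Five realisations of one Poisson process give five different point counts
disc10 <- disc(10)
lambda <- 500 / area(disc10)
replicate(5, npoints(rpoispp(lambda = lambda, win = disc10)))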

Simulating clustered and inhibitory patterns


The spatstat package also has functions for generating point patterns from other
process models. These generally fall into one of two classes: clustered processes,
where points occur together more than under a uniform Poisson process, and regular
(aka inhibitory) processes where points are more spaced apart than under a uniform
intensity Poisson process. Some process models can generate patterns on a continuum
from clustered through uniform to regular depending on their parameters.

The quadrat.test() function can test against clustered or regular alternative


hypotheses. By default it tests against either of those, but this can be changed
with the alternative parameter to create a one-sided test.

A Thomas process is a clustered pattern where a number of "parent" points,


uniformly distributed, create a number of "child" points in their neighborhood. The
child points themselves form the pattern. This is an attractive point pattern, and
makes sense for modeling things like trees, since new trees will grow near the
original tree. Random Thomas point patterns can be generated using rThomas(). This
takes three numbers that determine the intensity and clustering of the points, and
a window object.

Conversely the points of a Strauss process cause a lowering in the probability of


finding another point nearby. The parameters of a Strauss process can be such that
it is a "hard-core" process, where no two points can be closer than a set
threshold. Creating points from this process involves some clever simulation
algorithms. This is a repulsive point pattern, and makes sense for modeling things
like territorial animals, since the other animals of that species will avoid the
territory of a given animal. Random Strauss point patterns can be generated using
rStrauss(). This takes three numbers that determine the intensity and "territory"
of the points, and a window object. Points generated by a Strauss process are
sometimes called regularly spaced.

Nearest-neighbor distributions
Another way of assessing clustering and regularity is to consider each point, and
how it relates to the other points. One simple measure is the distribution of the
distances from each point to its nearest neighbor.

The nndist() function in spatstat takes a point pattern and for each point returns
the distance to its nearest neighbor. You can then plot the histogram.

Instead of working with the nearest-neighbor density, as seen in the histogram, it


can be easier to work with the cumulative distribution function, G(r). This is the
probability of a point having a nearest neighbour within a distance r.

For a uniform Poisson process, G can be computed theoretically, and is G(r) = 1 -


exp( - lambda * pi * r ^ 2). You can compute G empirically from your data using
Gest() and so compare with the theoretical value.
Events near the edge of the window might have had a nearest neighbor outside the
window, and so unobserved. This will make the distance to its observed nearest
neighbor larger than expected, biasing the estimate of G. There are several methods
for correcting this bias.

Plotting the output from Gest shows the theoretical cumulative distribution and
several estimates of the cumulative distribution using different edge corrections.
Often these edge corrections are almost indistinguishable, and the lines overlap.
The plot can be used as a quick exploratory test of complete spatial randomness.
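For example, the theoretical curve can be computed directly and laid over Gest()'s edge-corrected estimates (a sketch assuming the p_poisson pattern used in the exercise code below; Gest() also returns this curve in its theo column):

# Compare the empirical estimate of G with the Poisson theoretical curve
G_hat <- Gest(p_poisson)
lambda_hat <- intensity(p_poisson)   # estimate the intensity from the pattern itself
plot(G_hat)
lines(G_hat$r, 1 - exp(-lambda_hat * pi * G_hat$r^2), lty = 3)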

Other point pattern distribution functions


A number of other functions of point patterns have been developed. They are
conventionally denoted by various capital letters, including F, H, J, K and L.

The K-function is defined as the expected number of points within a distance of a


point of the process, scaled by the intensity. Like G, this can be computed
theoretically for a uniform Poisson process and is K(r) = pi * r ^ 2 (the area of
a circle of that radius). Deviation from pi * r ^ 2 can indicate clustering or point
inhibition.

Computational estimates of K(r) are done using the Kest() function.

As with G calculations, K-function calculations also need edge corrections. The


default edge correction in spatstat is generally the best, but can be slow, so
we'll use the "border" correction for speed here.

Uncertainties on K-function estimates can be assessed by randomly sampling points


from a uniform Poisson process in the area and computing the K-function of the
simulated data. Repeat this process 99 times, and take the minimum and maximum
value of K over each of the distance values. This gives an envelope - if the K-
function from the data goes above the top of the envelope then we have evidence for
clustering. If the K-function goes below the envelope then there is evidence for an
inhibitory process causing points to be spaced out. Envelopes can be computed using
the envelope() function.

The plot method for estimates of K uses a formula system where a dot on the left of
a formula refers to K(r). So the default plot uses . ~ r. You can compare the
estimate of K to a Poisson process by plotting . - pi * r ^ 2 ~ r. If the data was
generated by a Poisson process, then the line should be close to zero for all
values of r.

n_points <- 300


radius <- 10
# Generate uniform random numbers up to radius-squared
r_squared <- runif(n_points, 0, radius^2)
angle <- runif(n_points, 0, 2*pi)

# Take the square root of the values to get a uniform spatial distribution
x <- sqrt(r_squared) * cos(angle)
y <- sqrt(r_squared) * sin(angle)

plot(disc(radius)); points(x, y)
## spatstat::disc() creates a disc-shaped window
# Set coordinates and window
ppxy <- ppp(x = x, y = y, window = disc(radius) )

# Test the point pattern


qt <- quadrat.test(ppxy)

# Inspect the results


plot(qt)
print(qt)

# Create a disc of radius 10


disc10 <- disc(10)
# Compute the rate as count divided by area
lambda <- 500 / area(disc10)
# Create a point pattern object
ppois <- rpoispp(lambda = lambda, win = disc10)
# Plot the Poisson point pattern
plot(ppois)

# Generate clustered points from a Thomas process


set.seed(123)
p_cluster <- rThomas(kappa = 0.35, scale = 1, mu = 3, win = disc10)
plot(p_cluster)
quadrat.test(p_cluster, alternative = "clustered")

# Regular points from a Strauss process


set.seed(123)
p_regular <- rStrauss(beta = 2.9, gamma = 0.025, R = .5, W = disc10)
plot(p_regular)
quadrat.test(p_regular, alternative = "regular")

# Point patterns are pre-defined


p_poisson; p_regular

# Calc nearest-neighbor distances for Poisson point data


nnd_poisson <- nndist(p_poisson)

# Draw a histogram of nearest-neighbor distances


hist(nnd_poisson)

# Estimate G(r)
G_poisson <- Gest(p_poisson)

# Plot G(r) vs. r


plot(G_poisson)

# Repeat for regular point data


nnd_regular <- nndist(p_regular)
hist(nnd_regular)
G_regular <- Gest(p_regular)
plot(G_regular)

# Point patterns are pre-defined


p_poisson; p_cluster; p_regular
# Estimate the K-function for the Poisson points
K_poisson <- Kest(p_poisson, correction = "border")

# The default plot shows quadratic growth


plot(K_poisson, . ~ r)

# Subtract pi * r ^ 2 from the Y-axis and plot


plot(K_poisson, . - pi*r^2 ~ r)

# Compute envelopes of K under random locations


K_cluster_env <- envelope(p_cluster, Kest, correction = "border")

# Insert the full formula to plot K minus pi * r^2


plot(K_cluster_env, . - pi * r ^ 2 ~ r)

# Repeat for regular data


K_regular_env <- envelope(p_regular, Kest, correction = 'border')
plot(K_regular_env, . - pi * r ^ 2 ~ r)

# Load the spatstat package


library(spatstat)

# Get some summary information on the dataset


summary(preston_crime)

# Get a table of marks


table(marks(preston_crime))

# Define a function to create a map


preston_map <- function(cols = c("green","red"), cex = c(1, 1), pch = c(1, 1)) {
plotRGB(preston_osm) # from the raster package
plot(preston_crime, cols = cols, pch = pch, cex = cex, add = TRUE, show.window =
TRUE)
}

# Draw the map with colors, sizes and plot character


preston_map(
cols = c("black", "red"),
cex = c(0.5, 1),
pch = c(19,19)
)

# preston_crime has been pre-defined


preston_crime

# Use the split function to show the two point patterns


crime_splits <- split(preston_crime)

# Plot the split crime


plot(crime_splits)

# Compute the densities of both sets of points


crime_densities <- density(crime_splits)

# Calc the violent density divided by the sum of both


frac_violent_crime_density <- crime_densities[[2]] /
(crime_densities[[1]] + crime_densities[[2]])

# Plot the density of the fraction of violent crime


plot(frac_violent_crime_density)

## Bandwidth selection
## We can get a more principled measure of the violent crime ratio using a spatial
segregation model. The spatialkernel package implements the theory of spatial
segregation. The first step is to compute the optimal bandwidth for kernel
smoothing under the segregation model. A small bandwidth would result in a density
that is mostly zero, with spikes at the event locations. A large bandwidth would
flatten out any structure in the events, resulting in a large "blob" across the
whole window. Somewhere between these extremes is a bandwidth that best represents
an underlying density for the process.

## spseg() will scan over a range of bandwidths and compute a test statistic using
a cross-validation method. The bandwidth that maximizes this test statistic is the
one to use. The returned value from spseg() in this case is a list, with h and cv
elements giving the values of the statistic over the input h values. The
spatialkernel package supplies a plotcv function to show how the test value varies.
The hcv element has the value of the best bandwidth.

# Scan from 500m to 1000m in steps of 50m


bw_choice <- spseg(
preston_crime,
h = seq(500, 1000, by = 50),
opt = 1)

# Plot the results and highlight the best bandwidth


plotcv(bw_choice); abline(v = bw_choice$hcv, lty = 2, col = "red")

# Print the best bandwidth


print(bw_choice$hcv)

Segregation probabilities

The second step is to compute the probabilities for violent and non-violent crimes
as a smooth surface, as well as the p-values for a point-wise test of segregation.
This is done by calling spseg() with opt = 3 and a fixed bandwidth parameter h.

Normally you would run this process for at least 100 simulations, but that will
take too long to run here. Instead, run for only 10 simulations. Then you can use a
pre-loaded object seg which is the output from a 1000 simulation run that took
about 20 minutes to complete.

# Set the correct bandwidth and run for 10 simulations only


seg10 <- spseg(
pts = preston_crime,
h = bw_choice$hcv,
opt = 3,
ntest = 10,
proc = FALSE)
# Plot the segregation map for violent crime
plotmc(seg10, "Violent crime")
# Plot seg, the result of running 1000 simulations
plotmc(seg, "Violent crime")

Mapping segregation
With a base map and some image and contour functions we can display both the
probabilities and the significance tests over the area with more control than the
plotmc() function.

The seg object is a list with several components. The X and Y coordinates of the
grid are stored in the $gridx and $gridy elements. The probabilities of each class
of data (violent or non-violent crime) are in a matrix element $p with a column for
each class. The p-value of the significance test is in a similar matrix element
called $stpvalue. Rearranging columns of these matrices into a grid of values can
be done with R's matrix() function. From there you can construct list objects with
a vector $x of X-coordinates, $y of Y-coordinates, and $z as the matrix. You can
then feed this to image() or contour() for visualization.

This process may seem complex, but remember that with R you can always write
functions to perform complex tasks and those you may repeat often. For example, to
help with the mapping in this exercise you will create a function that builds a map
from four different items.

The seg object from 1000 simulations is loaded, as well as the preston_crime points
and the preston_osm map image.

# Inspect the structure of the spatial segregation object


str(seg)

# Get the number of columns in the data so we can rearrange to a grid


ncol <- length(seg$gridx)

# Rearrange the probability column into a grid


prob_violent <- list(x = seg$gridx,
y = seg$gridy,
z = matrix(seg$p[, "Violent crime"],
ncol = ncol))
image(prob_violent)

# Rearrange the p-values, but choose a p-value threshold


p_value <- list(x = seg$gridx,
y = seg$gridy,
z = matrix(seg$stpvalue[, "Violent crime"] < 0.05,
ncol = ncol))
image(p_value)

# Create a mapping function


segmap <- function(prob_list, pv_list, low, high){

# background map
plotRGB(preston_osm)

# p-value areas
image(pv_list,
col = c("#00000000", "#FF808080"), add = TRUE)

# probability contours
contour(prob_list,
levels = c(low, high),
col = c("#206020", "red"),
labels = c("Low", "High"),
add = TRUE)

# boundary window
plot(Window(preston_crime), add = TRUE)
}

# Map the probability and p-value


segmap(prob_violent, p_value, .05, .15)

# Get a quick summary of the dataset


summary(sasq)

# Plot unmarked points


plot(unmark(sasq))

# Plot the points using a circle sized by date


plot(sasq, which.marks = "date")

Temporal pattern of bigfoot sightings


Having established that there is some spatial clustering going on, you need to
explore the temporal behavior. Is the number of sightings increasing? Decreasing?
Does the rate vary over the course of a year ("seasonality")? Does the spatial
pattern change much over the course of a year?

The base R hist() function has a method for dates that lets you specify a time unit
for the breaks. You pass a string to the breaks argument, such as "days", "weeks",
"months", "quarters" or "years".

# Show the available marks


names(marks(sasq))

# Histogram the dates of the sightings, grouped by year


hist(marks(sasq)$date, breaks = "years", freq = TRUE)

# Plot and tabulate the calendar month of all the sightings


plot(table(marks(sasq)$month))

# Split on the month mark


sasq_by_month <- split(sasq, "month", un = TRUE)

# Plot monthly maps


plot(sasq_by_month)

# Plot smoothed versions of the above split maps


plot(density(sasq_by_month))

# Get a matrix of event coordinates


sasq_xy <- as.matrix(coords(sasq))

# Check the matrix has two columns


dim(sasq_xy)
# Get a vector of event times
sasq_t <- marks(sasq)$date

# Extract a two-column matrix from the ppp object


sasq_poly <- as.matrix(as.data.frame(Window(sasq)))
dim(sasq_poly)

# Set the time limit to 1 day before and 1 day after the range of times
tlimits <- range(sasq_t) + c(-1, 1)

# Scan over 400m intervals from 100m to 20km


s <- seq(100, 20000, by = 400)

# Scan over 14 day intervals from one week to 31 weeks


tm <- seq(1*7, 31*7, by = 14)

# Run 999 simulations


sasq_mc <- stmctest(sasq_xy, sasq_t, sasq_poly, tlimits, s, tm, nsim = 999, quiet =
TRUE)
names(sasq_mc)

# Histogram the simulated statistics and add a line at the data value
ggplot(data.frame(sasq_mc), aes(x = t)) +
geom_histogram(binwidth = 1e13) +
geom_vline(aes(xintercept = t0))

# Compute the p-value as the proportion of tests greater than the data
sum(sasq_mc$t > sasq_mc$t0) / 1000
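A slightly more conservative Monte Carlo convention (a side note, not part of the exercise) counts the observed statistic as one of the simulations in both numerator and denominator:

# p-value with the observed statistic included in the reference set
(1 + sum(sasq_mc$t >= sasq_mc$t0)) / (1 + 999)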

London EU referendum data


In 2016 the UK held a public vote on whether to remain in the European Union. The
results of the referendum, where people voted either "Remain" or "Leave", are
available online. The data set london_ref contains the results for the 32 boroughs
of London, and includes the number and percentage of votes in each category as well
as the count of spoilt votes, the population size and the electorate size.

The london_ref object is a SpatialPolygonsDataFrame, a special kind of data frame


where each row also has the shape of the borough. It behaves like a data frame in
many respects, but can also be used to plot a choropleth, or shaded polygon, map.
NAME : the name of the borough.
Electorate : the total number of people who can vote.
Remain, Leave : the number of votes for "Remain" or "Leave".
Pct_Remain, Pct_Leave : the percentage of votes for each side.
spplot() from the sp package provides a convenient way to draw a shaded map of regions.

# See what information we have for each borough


summary(london_ref)
# Which boroughs voted to "Leave"?
london_ref$NAME[london_ref$Leave > london_ref$Remain]
# Plot a map of the percentage that voted "Remain"
spplot(london_ref, zcol = "Pct_Remain")

Cartogram
Large areas, such as cities or countries, are often divided into smaller
administrative units, often into zones of approximately equal population. But the
area of those units may vary considerably. When mapping them, the large areas carry
more visual "weight" than small areas, although just as many people live in the
small areas.
One technique for correcting for this is the cartogram. This is a controlled
distortion of the regions, expanding some and contracting others, so that the area
of each region is proportional to a desired quantity, such as the population. The
cartogram also tries to maintain the correct geography as much as possible, by
keeping regions in roughly the same place relative to each other.
The cartogram package contains functions for creating cartograms. You give it a
spatial data frame and the name of a column, and you get back a similar data frame
but with regions distorted so that the region area is proportional to the column
value of the regions.

You'll also use the rgeos package for computing the areas of individual regions
with the gArea() function.

# Use the cartogram and rgeos packages


library(cartogram)
library(rgeos)

# Make a scatterplot of electorate vs borough area


names(london_ref)
plot(london_ref$Electorate, gArea(london_ref, byid = TRUE))

# Make a cartogram, scaling the area to the electorate


carto_ref <- cartogram(london_ref, "Electorate")
plot(carto_ref)

# Check the linearity of the electorate-area plot


plot(carto_ref$Electorate, gArea(carto_ref, byid = TRUE))

# Make a fairer map of the Remain percentage


spplot(carto_ref, "Pct_Remain")

Spatial autocorrelation test


The map of "Remain" votes seems to have spatial correlation. Pick any two boroughs
that are neighbors - with a shared border - and the chances are they'll be more
similar than any two random boroughs. This can be a problem when using statistical
models that assume, conditional on the model, that the data points are independent.

The spdep package has functions for measures of spatial correlation, also known as
spatial dependency. Computing these measures first requires you to work out which
regions are neighbors via the poly2nb() function, short for "polygons to
neighbors". The result is an object of class nb. Then you can compute the test
statistic and run a significance test on the null hypothesis of no spatial
correlation. The significance test can either be done by Monte-Carlo or theoretical
models.

In this example you'll use the Moran "I" statistic to test the spatial correlation
of the population and the percentage "Remain" vote.

# Use the spdep package


library(spdep)
# Make neighbor list
borough_nb <- poly2nb(london_ref)

# Get center points of each borough


borough_centers <- coordinates(london_ref)

# Show the connections


plot(london_ref); plot(borough_nb, borough_centers, add = T)

# Map the total pop'n


spplot(london_ref, zcol = "TOTAL_POP")

# Run a Moran I test on total pop'n


moran.test( london_ref$TOTAL_POP, nb2listw(borough_nb))

# Map % Remain
spplot(london_ref, zcol = "Pct_Remain")

# Run a Moran I MC test on % Remain


moran.mc(london_ref$Pct_Remain, nb2listw(borough_nb), nsim = 999)

>> You should have found that the p-value was around 0.1 in the first case, thus
you did not find any significant spatial correlation. In the second case, the p-
value was around 0.001, so you did find some significant spatial correlation.

# Get a summary of the data set


summary(london)

# Map the OBServed number of flu reports


spplot(london, "Flu_OBS")

# Compute and print the overall incidence of flu


r <- sum(london$Flu_OBS) / sum(london$TOTAL_POP)
r

# Calculate the expected number for each borough


london$Flu_EXP <- london$TOTAL_POP * r

# Calculate the ratio of OBServed to EXPected


london$Flu_SMR <- london$Flu_OBS / london$Flu_EXP

# Map the SMR


spplot(london, "Flu_SMR")

# For the binomial statistics function


library(epitools)

# Get CI from binomial distribution


flu_ci <- binom.exact(london$Flu_OBS, london$TOTAL_POP)

# Add borough names


flu_ci$NAME <- london$NAME

# Calculate London rate, then compute SMR


r <- sum(london$Flu_OBS) / sum(london$TOTAL_POP)
flu_ci$SMR <- flu_ci$proportion / r
# Subset the high SMR data
flu_high <- flu_ci[flu_ci$SMR > 1, ]

# Plot estimates with CIs


library(ggplot2)
ggplot(flu_high, aes(x = NAME, y = proportion / r,
ymin = lower / r, ymax = upper / r)) +
geom_pointrange() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

Exceedence probabilities
Distributions and confidence intervals can be difficult things to present to non-
statisticians. An alternative is to present a probability that a value is over a
threshold. For example, public health teams might be interested in when an SMR has
more than doubled, and as a statistician you can give a probability that this has
happened. Then the public health team might decide to go to some alert level when
the probability of a doubling of SMR is over 0.95.

Again, the properties of the binomial distribution let you compute this for
proportional data. You can then map these exceedence probabilities for some
threshold, and use a sensible color scheme to highlight probabilities close to 1.

# Probability of a binomial exceeding a multiple


binom.exceed <- function(observed, population, expected, e){
1 - pbinom(e * expected, population, prob = observed / population)
}

# Compute P(rate > 2)


london$Flu_gt_2 <- binom.exceed(
observed = london$Flu_OBS,
population = london$TOTAL_POP,
expected = london$Flu_EXP,
e = 2)

# Use a 50-color palette that only starts changing at around 0.9


pal <- c(
rep("#B0D0B0", 40),
colorRampPalette(c("#B0D0B0", "orange"))(5),
colorRampPalette(c("orange", "red"))(5)
)

# Plot the P(rate > 2) map


spplot(london, "Flu_gt_2", col.regions = pal, at = seq(0, 1, len = 50))

A Poisson GLM
A Poisson generalized linear model is a way of fitting count data to explanatory
variables. You get out parameter estimates and standard errors for your explanatory
variables, and can get fitted values and residuals.

The glm() function fits Poisson GLMs. It works just like the lm() function, but you
also specify a family argument. The formula has the usual meaning - response on the
left of the ~, and explanatory variables on the right.

To cope with count data coming from populations of different sizes, you specify an
offset argument. This adds a constant term for each row of the data in the model.
The log of the population is used in the offset term.
# Fit a poisson GLM.
model_flu <- glm(data = london, Flu_OBS ~ HealthDeprivation, offset =
log(TOTAL_POP), family = poisson)

# Is HealthDeprivation significant?
summary(model_flu)

# Put residuals into the spatial data.


london$Flu_Resid <- residuals(model_flu)

# Map the residuals using spplot


spplot(london, "Flu_Resid")

Residuals
A linear model should fit the data and leave uncorrelated residuals. This applies
to non-spatial models, where, for example, fitting a straight line through points
on a curve would lead to serially-correlated residuals. A model on spatial data
should aim to have residuals that show no significant spatial correlation.

You can test the model fitted to the flu data using moran.mc() from the spdep
package. Monte Carlo Moran tests were previously discussed in the Spatial
autocorrelation test exercise earlier in the chapter.

# Compute the neighborhood structure.


library(spdep)
borough_nb <- poly2nb(london)

# Test spatial correlation of the residuals.


moran.mc(london$Flu_Resid, listw = nb2listw(borough_nb), nsim = 999)

Fit a Bayesian GLM


Bayesian statistical models return samples of the parameters of interest (the
"posterior" distribution) based on some "prior" distribution which is then updated
by the data. The Bayesian modeling process returns a number of samples from which
you can compute the mean, or an exceedence probability, or any other quantity you
might compute from a distribution.

Before you fit a model with spatial correlation, you'll first fit the same model as
before, but using Bayesian inference.

# Use R2BayesX
library(R2BayesX)

# Fit a GLM
model_flu <- glm(Flu_OBS ~ HealthDeprivation, offset = log(TOTAL_POP),
data = london, family = poisson)

# Summarize it
summary(model_flu)

# Calculate coeff confidence intervals


confint(model_flu)

# Fit a Bayesian GLM


bayes_flu <- bayesx(Flu_OBS ~ HealthDeprivation, offset = log(london$TOTAL_POP),
family = "poisson", data = as.data.frame(london),
control = bayesx.control(seed = 17610407))

# Summarize it
summary(bayes_flu)

# Look at the samples from the Bayesian model


plot(samples(bayes_flu))

Adding a spatially autocorrelated effect


You've fitted a non-spatial GLM with BayesX. You can include a spatially correlated
term based on the adjacency structure by adding a term to the formula specifying a
spatially correlated model.

The spatial data object, london is already loaded.

Use poly2nb() to compute the neighborhood structure of london to an nb object.


R2BayesX uses its own objects for the adjacency. Convert the nb object to a gra
object.
The sx function specifies additional terms to bayesx. Create a term using the
"spatial" basis and the gra object for the boroughs to define the map.
Print a summary of the model object. You should see a table of coefficients for the
parametric part of the model as in the previous exercise, and then a table of
"Smooth terms variance" with one row for the spatial term. Note that since BayesX
can fit many different forms in its sx terms, most of which, like the spatial model
here, cannot be simply expressed with a parameter or two. This table shows the
variance of the random effects - for further explanation consult a text on random
effects modeling.

# Compute adjacency objects


borough_nb <- poly2nb(london)
borough_gra <- nb2gra(borough_nb)

# Fit spatial model


flu_spatial <- bayesx(
Flu_OBS ~ HealthDeprivation + sx(i, bs = "spatial", map = borough_gra),
offset = log(london$TOTAL_POP),
family = "poisson", data = data.frame(london),
control = bayesx.control(seed = 17610407)
)

# Summarize the model


summary(flu_spatial)

Mapping the spatial effects


As with glm, you can get the fitted values and residuals from your model using the
fitted and residuals functions. bayesx models are a bit more complex, since you
have the linear predictor and terms from sx elements, such as the spatially
correlated term.

The summary function will show you information for the linear model terms and the
smoothing terms in two separate tables. The spatial term is called "sx(i):mrf" -
standing for "Markov Random Field".

Bayesian analysis returns samples from a distribution for our S(x) term at each of
the London boroughs. The fitted function from bayesx models returns summary
statistics for each borough. You'll just look at the mean of that distribution for
now.

# Summarize the model


summary(flu_spatial)

# Map the fitted spatial term only


london$spatial <- fitted(flu_spatial, term = "sx(i):mrf")[, "Mean"]
spplot(london, zcol = "spatial")

# Map the residuals


london$spatial_resid <- residuals(flu_spatial)[, "mu"]
spplot(london, zcol = "spatial_resid")

# Test residuals for spatial correlation


moran.mc(london$spatial_resid, nb2listw(borough_nb), 999)

Canadian geochemical survey data


Your job is to study the acidity (pH) of some Canadian survey data. The survey
measurements are loaded into a spatial data object called ca_geo.

The acidity survey data, ca_geo has been pre-defined.

Look at the names of columns in the data and get a summary of the numerical pH
values. You should notice there are some missing values (NA's). Make a histogram of
the acidity.
Construct a vector that is TRUE for the rows with missing pH values. You should
have 33.
Plot a map of the survey data. You need to subset the data to remove the missing
values. The spplot() function needs a column name in quotes to map that data.

# ca_geo has been pre-defined


str(ca_geo, 1)

# See what measurements are at each location


names(ca_geo)

# Get a summary of the acidity (pH) values


summary(ca_geo$pH)

# Look at the distribution


hist(ca_geo$pH)

# Make a vector that is TRUE for the missing data


miss <- is.na(ca_geo$pH)
table(miss)

# Plot a map of acidity


spplot(ca_geo[!miss, ], "pH")
Fitting a trend surface
The acidity data shows pH broadly increasing from north-east to south-west. Fitting
a linear model with the coordinates as covariates will interpolate a flat plane
through the values.

Instructions
The acidity survey data, ca_geo has been pre-defined.

The response, on the left of the ~ sign, is the name of the column we are modeling.
The explanatory variables are on the right of the ~ sign, separated by a + sign,
and are the names of the coordinate columns obtained by coordnames().
Fit the model and check whether the model parameters are significant by looking for
stars in the coefficients table.

# ca_geo has been pre-defined


str(ca_geo, 1)

# Are they called lat-long, up-down, or what?


coordnames(ca_geo)

# Complete the formula


m_trend <- lm(pH ~ x + y, as.data.frame(ca_geo))

# Check the coefficients


summary(m_trend)

Predicting from a trend surface


Your next task is to compute the pH at the locations that have missing data in the
source. You can use the predict() function on the fitted model from the previous
exercise for this.

The acidity survey data, ca_geo, and the linear model, m_trend have been pre-
defined.
Construct a vector that is TRUE for the rows with missing pH values.
Take a subset of the data wherever the pH is missing, assigning the result to
ca_geo_miss.
By default predict() will return predictions at all the original locations.
Pass the model as the first argument, as usual.
Pass ca_geo_miss to the newdata argument to predict missing values.
Assign the result to predictions.
Alkaline soils are those with a pH over 7. Our linear model gives us estimates and
standard deviation based on a normal (Gaussian) assumption. Compute the probability
of the soil being over 7 using pnorm() with the mean and standard deviation values
from the prediction data.

# ca_geo, miss, m_trend have been pre-defined


ls.str()
# Make a vector that is TRUE for the missing data
miss <- is.na(ca_geo$pH)
# Create a data frame of missing data
ca_geo_miss <- as.data.frame(ca_geo)[miss, ]
# Predict pH for the missing data
predictions <- predict(m_trend, newdata = ca_geo_miss, se.fit = TRUE)
# Compute the exceedence probability
pAlkaline <- 1 - pnorm(7, mean = predictions$fit, sd = predictions$se.fit)
hist(pAlkaline)

Variogram estimation
You can use the gstat package to plot variogram clouds and the variograms from
data. Recall:

The variogram cloud shows the differences of the measurements against distance for
all pairs of data points.
The binned variogram divides the cloud into distance bins and computes the average
difference within each bin.
The y-range of the binned variogram is always much smaller than the variogram cloud
because the cloud includes the full range of values that go into computing the mean
for the binned variogram.

# ca_geo, miss have been pre-defined


ls.str()

# Make a cloud from the non-missing data up to 10km


plot(variogram(pH ~ 1, ca_geo[!miss, ], cloud = TRUE, cutoff = 10000))

# Make a variogram of the non-missing data


plot(variogram(pH ~ 1, ca_geo[!miss, ]))

Variogram with spatial trend


You might imagine that if soil at a particular point is alkaline, then soil one
metre away is likely to be alkaline too. But can you say the same thing about soil
one kilometre away, or ten kilometres, or one hundred kilometres?

The shape of the previous variogram tells you there is a large-scale trend in the
data. You can fit a variogram considering this trend with gstat. This variogram
should flatten out, indicating there is no more spatial correlation after a certain
distance with the trend taken into account.

# ca_geo, miss have been pre-defined


ls.str()

# See what coordinates are called


coordnames(ca_geo)

# The pH depends on the coordinates


ph_vgm <- variogram(pH ~ x + y, ca_geo[!miss, ])
plot(ph_vgm)

Variogram model fitting


Next you'll fit a model to your variogram. The gstat function fit.variogram() does
this. You need to give it some initial values as a starting point for the
optimization algorithm to fit a better model.

The sill is the upper limit of the model. That is, the long-range largest
value, ignoring any outliers.
# ca_geo, miss, ph_vgm have been pre-defined
ls.str()

# Eyeball the variogram and estimate the initial parameters


nugget <- .15
psill <- .15
range <- 10000

# Fit the variogram


v_model <- fit.variogram(
ph_vgm,
model = vgm(
model = "Ste",
nugget = nugget,
psill = psill,
range = range,
kappa = 0.5
)
)

# Show the fitted variogram on top of the binned variogram


plot(ph_vgm, model = v_model)
print(v_model)

Filling in the gaps


The final part of geostatistical estimation is kriging itself. This is the application
of the variogram along with the sample data points to produce estimates and
uncertainties at new locations.

The computation of estimates and uncertainties, together with the assumption of a


normal (Gaussian) response means you can compute any function of the estimates -
for example the probability of a new location having alkaline soil.

# ca_geo, miss, v_model have been pre-defined


ls.str()

# Set the trend formula and the new data


km <- krige(pH ~ x + y, ca_geo[!miss, ], newdata = ca_geo[miss, ], model = v_model)
names(km)

# Plot the predicted values


spplot(km, "var1.pred")

# Compute the probability of alkaline samples, and map


km$pAlkaline <- 1 - pnorm(7, mean = km$var1.pred, sd = sqrt(km$var1.var))
spplot(km, "pAlkaline")

Making a prediction grid


You have been asked to produce an alkaline probability map over the study area. To
do this, you are going to do some kriging via the krige() function. This requires a
SpatialPixels object which will take a bit of data manipulation to create. You
start by defining a grid, creating points on that grid, cropping to the study
region, and then finally converting to SpatialPixels. On the way, you'll meet some
new functions.
GridTopology() defines a rectangular grid. It takes three vectors of length two as
inputs. The first specifies the position of the bottom left corner of the grid. The
second specifies the width and height of each rectangle in the grid, and the third
specifies the number of rectangles in each direction.

To ensure that the grid and the study area have the same coordinates, some
housekeeping is involved. SpatialPoints() converts the points to a coordinate
reference system (CRS), or projection (different packages use different terminology
for the same concept). The CRS is created by wrapping the study area in
projection(), then in CRS(). For the purpose of this exercise, you don't need to
worry about exactly what these functions do, only that this data manipulation is
necessary to align the grid and the study area.

Now that you have that alignment, crop(), as the name suggests, crops the grid to
the study area.

Finally, SpatialPixels() converts the raster cropped gridpoints to the equivalent


sp object.

The acidity survey data, ca_geo, the missing value index, miss, the variogram, vgm,
and the variogram model, v_model, have been pre-defined.

A rough outline of the study area is in an object called geo_bounds.

Use bbox(geo_bounds) to get the corners of the grid.


Construct a rectangular grid over the region.
Call GridTopology().
Round the position of the bottom left corner to the nearest integer and pass it as
the first argument.
Assign the result to grid.
Align the grid coordinate with the study area boundary coordinates.
Call SpatialPoints().
Pass grid as the first argument.
Pass the coordinate reference system of the study area boundary to the proj4string
argument.
Assign the result to gridpoints.
Plot this object to see your progress.
Crop the grid points to the shape of the boundary object.
Call crop().
Pass the grid points as the first argument.
Pass the study area boundary as the second argument.
Assign the result to cropped_gridpoints.
Plot this object to see your progress.
Convert the cropped grid points to a SpatialPixels object.
Call SpatialPixels().
Pass cropped_gridpoints as the only argument.
Assign the result to spgrid.
Plot this object to see your progress.

# ca_geo, geo_bounds have been pre-defined


ls.str()

# Plot the polygon and points


plot(geo_bounds); points(ca_geo)

# Find the corners of the boundary


bbox(geo_bounds)

# Define a 2.5km square grid over the polygon extent. The first parameter is
# the bottom left corner.
grid <- GridTopology(c(537853,5536290), c(2500, 2500), c(72, 48))

# Create points with the same coordinate system as the boundary


gridpoints <- SpatialPoints(grid, proj4string = CRS(projection(geo_bounds)))
plot(gridpoints)

# Crop out the points outside the boundary


cropped_gridpoints <- crop(gridpoints, geo_bounds)
plot(cropped_gridpoints)

# Convert to SpatialPixels
spgrid <- SpatialPixels(cropped_gridpoints)
coordnames(spgrid) <- c("x", "y")
plot(spgrid)

Gridded predictions
Constructing the grid is the hard part done. You can now compute kriged estimates
over the grid using the variogram model from before (v_model) and the grid of
SpatialPixels.

Instructions
The spatial pixel grid of the region, spgrid, and the variogram model of pH,
v_model have been pre-defined.

Use kriging to predict pH in each grid rectangle throughout the study area.
Call krige().
The formula and input data are already specified.
Pass spgrid as the new data to predict.
Pass the variogram model to the model argument.
Calculate the probability of alkaline samples in each grid rectangle.
The mean of the predictions is the var1.pred element of ph_grid.
The variance of the predictions is the var1.var element of ph_grid. Take the square
root to get the standard deviation.
Plot the alkalinity in each grid rectangle.
Call spplot().
Pass the alkalinity column to the zcol argument as a string.

# spgrid, v_model have been pre-defined


ls.str()

# Do kriging predictions over the grid


ph_grid <- krige(pH ~ x + y, ca_geo[!miss, ], newdata = spgrid, model = v_model)

# Calc the probability of pH exceeding 7


ph_grid$pAlkaline <- 1 - pnorm(7, mean = ph_grid$var1.pred, sd =
sqrt(ph_grid$var1.var))

# Map the probability of alkaline samples


spplot(ph_grid, zcol = "pAlkaline")
Auto-kriging at point locations
The autoKrige() function in the automap package computes binned variograms, fits
models, does model selection, and performs kriging by making multiple calls to the
gstat functions you used previously. It can be a great time-saver but you should
always check the results carefully.

In this example you will get predictions at the missing data locations.

autoKrige() can try several variogram model types. In the example, you'll use a
Matern variogram model, which is commonly used in soil and forestry analyses. You
can see a complete list of available models by calling vgm() with no arguments.
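For reference, the model abbreviations (such as "Mat" for the Matern model used here) can be listed by calling vgm() with no arguments (assuming gstat is loaded):

# Print the table of variogram model types that gstat understands
vgm()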

Instructions
The acidity survey data, ca_geo, and the missing value index, miss, have been pre-
defined.

Call autoKrige() to automatically run a kriging model.


Set the formula for modeling acidity versus the position, as before.
The input_data is the non-missing data from the survey.
The new_data is the missing data from the survey.
Set the model to "Mat". (Note the capital M.)
Assign the result to ph_auto.
Call plot() on ph_auto to see the results.

# ca_geo, miss are pre-defined


ls.str()

# Kriging with linear trend, predicting over the missing points


ph_auto <- autoKrige(
pH ~ x + y,
input_data = ca_geo[!miss, ],
new_data = ca_geo[miss, ],
model = "Mat"
)

# Plot the variogram, predictions, and standard error


plot(ph_auto)

Auto-kriging over a grid


You can also use autoKrige() over the spgrid grid from the earlier exercise. This
brings together all the concepts that you've learned in the chapter. That is,
kriging is great for predicting missing data, plotting things on a grid is much
clearer than plotting individual points, and automatic kriging is less hassle than
manual kriging.

Instructions
The acidity survey data, ca_geo, the missing value index, miss, the spatial pixel
grid of the region, spgrid, the manual kriging grid model, ph_grid, and the
variogram model of pH, v_model have been pre-defined.

Automatically fit a kriging model.


Call autoKrige().
The first argument is the same formula you've used throughout the chapter.
The input_data argument contains the non-missing points from the survey data.
The new_data argument is the grid of prediction locations.
Assign the result to ph_auto_grid.
To remind yourself of the manual kriging predictions, plot ph_grid.
Plot ph_auto_grid. Do the predictions look similar or different?
To compare the manual and automated variogram models, print v_model the var_model
element of ph_auto_grid.

# ca_geo, miss, spgrid, ph_grid, v_model are pre-defined


ls.str()

# Auto-run the kriging


ph_auto_grid <- autoKrige(pH ~ x + y, input_data = ca_geo[!miss,], new_data =
spgrid)

# Remember predictions from manual kriging


plot(ph_grid)

# Plot predictions and variogram fit


plot(ph_auto_grid)

# Compare the variogram model to the earlier one


v_model
ph_auto_grid$var_model

######

Your job is to make a stacked bar chart with ggplot, then turn it into a pie chart by
applying the coordinate transform coord_polar(theta = 'y').
Notice x = 1 in the aesthetics. This is because we only want a single bar here.

# Wrangle data into form we want.


disease_counts <- who_disease %>%
mutate(disease = ifelse(disease %in% c('measles', 'mumps'), disease,
'other')) %>%
group_by(disease) %>%
summarise(total_cases = sum(cases))

# Use geom_col(), then switch the coordinate system to polar with theta = 'y'
ggplot(disease_counts, aes(x = 1, y = total_cases, fill = disease)) +
geom_col() +
coord_polar(theta = 'y')+
theme_void() +
ggtitle('Title')

##waffle !!!
disease_counts <- who_disease %>%
group_by(disease) %>%
summarise(total_cases = sum(cases)) %>%
mutate(percent = round(total_cases/sum(total_cases)*100))
# Create an array of rounded percentages for diseases.
# Name the percentage array with disease_counts$disease
# Pass case_counts vector to the waffle function to plot
case_counts <- disease_counts$percent
names(case_counts) <- disease_counts$disease
waffle(case_counts)

##filled bar --- geom_bar()


disease_counts <- who_disease %>%
mutate(disease = ifelse(disease %in% c('measles', 'mumps'), disease,
'other')) %>%
group_by(disease, year) %>% # note the addition of year to the grouping.
summarise(total_cases = sum(cases))
# Add the mapping of year to the x axis
# Change position to 'fill' to make bars full height
ggplot(disease_counts, aes(year, y = total_cases, fill = disease)) +
geom_col(position = 'fill')

disease_counts <- who_disease %>%
  mutate(
    # Change the factor levels to the desired stacking order
    disease = ifelse(disease %in% c('measles', 'mumps'), disease, 'other') %>%
      factor(levels = c('measles', 'other', 'mumps'))
  ) %>%
group_by(disease, year) %>%
summarise(total_cases = sum(cases))
# plot
ggplot(disease_counts, aes(x = year, y = total_cases, fill = disease)) +
geom_col(position = 'fill')

disease_counts <- who_disease %>%
  filter(year >= 1999) %>%
mutate(disease = ifelse(disease %in% c('measles', 'mumps'), disease,
'other')) %>%
group_by(disease, region) %>% # Add region column to grouping
summarise(total_cases = sum(cases))
# Set aesthetics so disease is the stacking variable, region is the x-axis, and counts are the y
ggplot(disease_counts, aes(region, total_cases, fill = disease )) +
# Add a column geometry with the proper position value.
geom_col(position = 'fill')

who_subset <- who_disease %>%
  # Modify the years to 1992 and 2002 in the next exercise
  filter(countryCode %in% interestingCountries, disease == 'measles',
         year %in% c(2006, 2016)) %>%
  mutate(year = paste0('cases_', year)) %>%
  spread(year, cases)

ggplot(who_subset, aes(x = log10(cases_2006), y = country)) +
  geom_point()

who_subset <- who_disease %>%
  filter(countryCode %in% interestingCountries, disease == 'measles',
         year %in% c(1992, 2002)) %>%
mutate(year = paste0('cases_', year)) %>%
spread(year, cases)
# Reorder y axis and change the cases year to 1992
ggplot(who_subset, aes(x = log10(cases_1992), y = reorder(country, cases_1992) )) +
geom_point()

# set y axis as country ordered with respect to logFoldChange


who_subset %>%
  mutate(logFoldChange = log2(cases_2016 / cases_2006)) %>%
  ggplot(aes(x = logFoldChange, y = reorder(country, logFoldChange))) +
  geom_point() +
  geom_vline(xintercept = 0) +                # add a visual anchor at x = 0
  xlim(-6, 6) +
  facet_grid(region ~ ., scales = 'free_y')   # one column of panels, arranged by region, free y scales

amr_pertussis <- who_disease %>%
  filter(region == 'AMR', year == 1980, disease == 'pertussis')
# x axis as country ordered by cases, then flip the axes
ggplot(amr_pertussis, aes(x = reorder(country, cases), y = cases)) +
geom_col() +
coord_flip()

amr_pertussis %>%
  filter(cases > 0) %>%
  ggplot(aes(x = reorder(country, cases), y = cases)) +
  geom_col() +
  coord_flip() +
  theme(panel.grid.major.y = element_blank())   # get rid of the 'major' y grid lines

amr_pertussis %>%
  filter(cases > 0) %>%
  ggplot(aes(x = reorder(country, cases), y = cases)) +
  geom_point(size = 2) +   # switch to points, size = 2
  scale_y_log10() +
  theme_minimal() +
  coord_flip()

# histogram of speed_over # Lower alpha to 0.7


ggplot(md_speeding) +
geom_histogram( aes(speed_over), alpha = .7) +
theme_minimal()

# set x and y to hour_of_day and stat(density)


ggplot(md_speeding) +
geom_histogram(aes(hour_of_day, stat(density) ), alpha = .8)

# filter data to just heavy duty trucks


truck_speeding <- md_speeding %>%
filter(vehicle_type == "Heavy Duty Truck")

ggplot(truck_speeding, aes(x = hour_of_day)) +
  geom_density(fill = 'steelblue', bw = 1.5) +
labs(title = 'Citations by hour', subtitle = 'Gaussian kernel SD = 1.5')

# add rug plot using geom_rug to see individual datapoints, set alpha to 0.5.
ggplot(truck_speeding, aes(x = hour_of_day)) +
geom_density(bw = 1.5, fill = 'steelblue', alpha = .7) +
geom_rug(alpha = .5) +
labs(title = 'Citations by hour', subtitle = "Gaussian kernel SD = 1.5")

# Increase the kernel bandwidth to 2.5 # lower rug-plot alpha to 0.05


ggplot(md_speeding, aes(x = percentage_over_limit)) +
geom_density(fill = 'steelblue', bw = 2.5, alpha = 0.7) +
geom_rug(alpha = 0.05) +
labs(title = 'Distribution of % over speed limit',
subtitle = "Gaussian kernel SD = 2.5")
# add a boxplot geometry
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(gender, speed)) +
geom_boxplot() +
labs(title = 'Speed of red cars by gender of driver')

# add jittered points # make boxplot transparent with alpha = 0


md_speeding %>% filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
geom_jitter(color = 'steelblue', alpha = .3 ) +
geom_boxplot(alpha = 0) +
labs(title = 'Speed of red cars by gender of driver')

# remove color filter # facet_wrap by vehicle_color


md_speeding %>%
ggplot(aes(x = gender, y = speed)) +
geom_jitter(alpha = 0.3, color = 'steelblue') +
geom_boxplot(alpha = 0) +
facet_wrap(~vehicle_color) +
labs(title = 'Speed of different car colors, separated by gender of driver')

library(ggbeeswarm)
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
# change point size to 0.5 and alpha to 0.8
geom_beeswarm(cex = .5, alpha = .8) +
# add a transparent boxplot on top of points
geom_boxplot(alpha = 0)

# violin geometry with kernel width of 2.5, add individual points on top of violins
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
geom_violin(bw = 2.5) +
geom_point(alpha = .3, size = .5)

# add a transparent boxplot and shrink its width to 0.3


# Reset point size to default and set point shape to 95
# Supply a subtitle detailing the kernel width
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
geom_violin(bw = 2.5) +
geom_boxplot(alpha = 0, width = .3) +
geom_point(alpha = 0.3, shape = 95) +
labs(subtitle = 'Gaussian kernel SD = 2.5')

#geom_violin --- fill, geom_jitter --- color


#replace with violin plot with kernel width of 2.5, change color argument to fill
md_speeding %>%
ggplot(aes(x = gender, y = speed)) +
geom_violin(bw = 2.5, fill = 'steelblue') +
geom_boxplot(alpha = 0, width = .3) +
facet_wrap(~vehicle_color) +
labs(title = 'Speed of different car colors, separated by gender of driver',
subtitle = 'Gaussian kernel width: 2.5')
library(ggridges)
md_speeding %>%
mutate(day_of_week = factor(day_of_week, levels =
c("Mon","Tues","Wed","Thu","Fri","Sat","Sun") )) %>%
ggplot(aes( x = percentage_over_limit, y = day_of_week)) +
geom_density_ridges(bandwidth = 3.5) +
scale_x_continuous(limits = c(0, 150)) +
labs(subtitle = 'Gaussian kernel SD = 3.5')

### element_blank()
#make ridgeline densities a bit see-through
md_speeding %>%
mutate(day_of_week = factor(day_of_week, levels =
c("Mon","Tues","Wed","Thu","Fri","Sat","Sun") )) %>%
ggplot(aes( x = percentage_over_limit, y = day_of_week)) +
geom_density_ridges(bandwidth = 3.5, alpha = .7) +
scale_x_continuous(limits = c(0,150), expand = c(0,0) ) +
labs(subtitle = 'Gaussian kernel SD = 3.5') +
theme(axis.ticks.y = element_blank())

#joyplot with rugs


# semi-transparent points (alpha = 0.2), '|' shape, nudged downward by 0.05
md_speeding %>%
mutate(day_of_week = factor(day_of_week, levels =
c("Mon","Tues","Wed","Thu","Fri","Sat","Sun") )) %>%
ggplot(aes( x = percentage_over_limit, y = day_of_week)) +
geom_point( alpha = .2, shape = '|', position= position_nudge(y = -.05) ) +
geom_density_ridges(bandwidth = 3.5, alpha = 0.7) +
scale_x_continuous(limits = c(0,150), expand = c(0,0)) +
labs(subtitle = 'Gaussian kernel SD = 3.5') +
theme( axis.ticks.y = element_blank() )

#####
library(lattice)

histogram(~ Ozone, data = airquality)


# nint = number of bins, quantity displayed 'percent, 'count' or 'density'
histogram(~Ozone, data = airquality, nint = 15, type = "count")

xyplot(Ozone ~ Solar.R, data = airquality)


xyplot(Ozone ~ Solar.R, data = airquality, main = 'Main Title', xlab = 'x', ylab = 'y')

densityplot(~ Ozone, data = airquality, plot.points = 'jitter')

str(USCancerRates) # 'USCancerRates' is pre-loaded


library(dplyr)
USCancerRates <- USCancerRates %>%
  mutate(state.ordered = reorder(state, rate.female, median, na.rm = TRUE))

# Create box and whisker plot


bwplot(state.ordered ~ rate.female, data = USCancerRates)
bwplot(state.ordered ~ rate.female, data = USCancerRates, coef = 0)   # coef = 0 changes the whisker extent
histogram(~ Ozone | factor(Month), data = airquality, layout = c(2,3), xlab =
'Ozone (ppb)' )

# Using `+` in the formula with outer = TRUE gives a trellis of separate panels
# Suppress data points, add a reference line
densityplot(~ rate.male + rate.female, data = USCancerRates, outer = TRUE,
            plot.points = FALSE, ref = TRUE)

# Create a density plot


densityplot(~ rate.male + rate.female , data = USCancerRates, outer = F, xlab =
'Rate (per 100,000)',
auto.key = T, plot.points = F, ref = T)

# Complete the legend spec --- list(space, title, columns, text)


xyplot(Ozone ~ Temp, airquality, groups = Month,
auto.key = list(space = 'right', title = 'Month', text = month.name[5:9]))

USCancerRates <- USCancerRates %>%
  mutate(division.ordered = reorder(division, rate.male, mean, na.rm = TRUE))
# Create conditioned scatter plot
# as.table = TRUE, levels() function matches the order of the panels
xyplot(rate.female ~ rate.male | division.ordered, data = USCancerRates, grid = T,
abline = c(0,1), as.table = T)

# Create box-and-whisker plot


bwplot(division.ordered ~ rate.male + rate.female, data = USCancerRates, outer = T,
       xlab = "Rate (per 100,000)",
       strip = strip.custom(factor.levels = c('Male', 'Female')))

#trellis
# 'USCancerRates' is pre-loaded
str(USCancerRates)

# Create "trellis" object


tplot <-
densityplot(~ rate.male + rate.female | division.ordered, data = USCancerRates,
outer = T, plot.points = F, as.table = T)

# Change names for the second dimension


dimnames(tplot)[[2]] <- c("Male", "Female")
# Update x-axis label and plot
update(tplot, xlab = 'Rate')

tplot <- densityplot(~ rate.male + rate.female | division.ordered,
                     data = USCancerRates, outer = T, plot.points = F, as.table = T)

# Inspect dimension
dim(tplot)
dimnames(tplot)

# Select subset retaining only last three divisions


tplot[7:9, ]
If you want different axis limits for different panels on the same graph, the
relation = "free" sub-component of the scales argument lets you specify limits for
each panel separately.

# Specify limits for each panel


dotplot(Cause ~ Rate | Sex + Status, data = USMortality,
as.table = T,
scales = list(x = list(relation = "free", limits = list(c(0, 50), c(0, 80),
c(0, 50), c(0, 80) ))))

Useful sub-components of the scales argument:

- tick.number: approximate number of tick marks / labels.
- alternating: 1 puts labels on the left/bottom boundary, 2 on the top/right, and 3 on both sides. The value can be a vector, in which case it applies row-wise or column-wise.
- rot: angle in degrees to rotate axis labels.

dotplot(Cause ~ Rate | Sex + Status, data = USMortality,


as.table = TRUE,
# Change the number of tick marks
scales = list(x = list(tick.number = 10,
# Show `Rate` labels on both bottom and top
alternating = 3,
# Rotate `Rate` labels by 90 degrees
rot = 90),
# Rotate `Cause` labels by 45 degrees
y = list(rot = 45)))

We have seen how the scales argument can be used to control various aspects of how
the coordinate axes are computed and annotated. A common use of the scales argument
is to explicitly specify tick mark locations using the at sub-component, and
optionally the associated text labels using the labels sub-component.

# Define at as 2^3 up to 2^8


x_ticks_at <- c(2^3, 2^4, 2^5, 2^6, 2^7, 2^8)

# x_ticks_at is a numeric vector with values 2^3, 2^4, ..., 2^8
# x_ticks_labels is a character vector: "8" for 2^3, "16" for 2^4, etc.
x_ticks_labels <- as.character(x_ticks_at)

dotplot(Cause ~ Rate | Sex, data = USMortality,
        groups = Status, auto.key = list(columns = 2),
        scales = list(x = list(log = 2, at = x_ticks_at, labels = x_ticks_labels)))

Log scales are useful for economic metrics which tend to show exponential growth
over time.
In this exercise we will create a dot plot of the WorldPhones dataset available in
R.

In the video, you learned how to log-transform the axis by specifying a suitable
base as the log component of scales. There is one more component you need to know,
equispaced.log. This component indicates if the tick marks are equispaced when log
scales are in use. By default, equispaced.log is set to TRUE. Note: If you set
equispaced.log = FALSE, you don't have to explicitly specify a base for the log
component; just log = TRUE should do the trick!
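
A minimal sketch of the difference, using the airquality data already loaded for earlier exercises (the variables chosen here are only for illustration):

# Default: tick marks equally spaced on the log-2 scale (2^4, 2^5, ...)
xyplot(Ozone ~ Temp, data = airquality,
       scales = list(y = list(log = 2, equispaced.log = TRUE)))
# equispaced.log = FALSE: tick marks at "pretty" values of the raw data;
# no base needs to be given, log = TRUE is enough
xyplot(Ozone ~ Temp, data = airquality,
       scales = list(y = list(log = TRUE, equispaced.log = FALSE)))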

The dataset is stored as a matrix, not a data frame as expected by lattice, so we
first need to transform it. Base R can create a tidy data frame from a matrix by
converting it to a table first: use as.table() followed by as.data.frame().

names(dimnames(WorldPhones)) <- c("Year", "Region")

# Transform matrix data to data frame # Intermediate step: convert to table


WorldPhonesDF <- as.data.frame(as.table(WorldPhones), responseName = "Phones")

# Create the dot plot; log-transform the x-axis; set x-axis relation to "sliced"
dotplot(Year ~ Phones | Region, data = WorldPhonesDF,
as.table = T, scales = list(x = list(log = T, equispaced.log = F, relation
= "sliced")))

# !!! as.table() vs table()

# Load latticeExtra package for ggplot2like()


library(latticeExtra)

# Transform matrix data to data frame


names(dimnames(WorldPhones)) <- c("Year", "Region")
WorldPhonesDF <- as.data.frame(as.table(WorldPhones[-1, ]), responseName =
"Phones")

# Create the dot plot


dotplot(Year ~ Phones | Region, data = WorldPhonesDF,
as.table = T, scales = list(x = list(log = TRUE, equispaced.log = FALSE,
relation = "sliced")),
# Fill in suitable value of par.settings
par.settings = ggplot2like(),
# Fill in suitable value of lattice.options
lattice.options = ggplot2like.opts() )

# Create factor variable #month.name is builtin variable


airquality$Month.Name <- factor(airquality$Month, levels = 1:12, labels =
month.name)

# Create histogram of Ozone, conditioning on Month


histogram(~ Ozone | Month.Name, data = airquality, as.table = T,
# Set border to be transparent, fill mid-gray
border = 'transparent', col = 'grey50')

Changing plotting characters


Grouped displays in lattice normally use color to distinguish between groups.
Sometimes it is also helpful to use different plotting symbols or line types for
different groups, so that they can be distinguished even in black and white
printouts. Plotting symbols (or plotting characters) are controlled by the
graphical parameter pch, which can be supplied as an optional argument. Many values
of pch are allowed, but you will use the two values 1 and 3 to distinguish between
males and females in the USMortality data. The value of 1 represents an open
circle, and the value of 3 represents a plus (+) symbol.

# Create the dot plot


dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
scales = list(x = list(log = T, equispaced.log = F)),
# Provide pch values for the two groups
pch = c(1, 3))

# Recreate the dot plot with pch = 1 (open circle) for males and pch = 3 (plus sign)
# for females, reflected in the legend as well
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
par.settings = simpleTheme(pch = c(3,1) ),
scales = list(x = list(log = 2, equispaced.log = F)))

Convert the Month code into a new variable Month.Name containing month names,
suitably ordering the levels. Drop empty levels of Month.Name using droplevels().
Obtain five colors with RColorBrewer::brewer.pal(n, name), where n is the desired
number of colors. Create a density plot of Ozone grouped by Month.Name, with line
colors taken from RColorBrewer, line width doubled, and the legend on the right.

levels(airquality$Month.Name)

# Drop empty levels


airquality$Month.Name <- droplevels(airquality$Month.Name)
levels(airquality$Month.Name)

# Obtain colors from RColorBrewer


library(RColorBrewer)
my.colors <- brewer.pal(n = 5, name = "Set1")

#Densityplot: colorbrewed
densityplot(~ Ozone , data = airquality, groups = Month.Name,
plot.points = FALSE,
auto.key = list(space = 'right'),
par.settings = simpleTheme(col = my.colors, lwd = 2))

Scatter plot with rugs


In lattice, the ability to control and customize how the input data is converted
into a graphical display is implemented through panel functions. You need custom
panel functions when the default panel cannot produce what you want, even with
optional arguments.

For the airquality dataset, a base R graphics scatter plot of ozone against solar radiation is:
plot(Ozone ~ Solar.R, data = airquality)

However, these measurements include some missing values, which are omitted from the
scatter plot, but could be informative. One common approach to include them in the
plot is by adding marginal "rugs" after the main scatter plot has been plotted.

with(airquality,
{ na.ozone <- is.na(Ozone)
na.solar.r <- is.na(Solar.R)
rug(Ozone[na.solar.r], side = 2)
rug(Solar.R[na.ozone], side = 1)
})

The goal is to encode these missing-value rugs in a panel function, additionally conditioning on Month.

panel.xyrug <- function(x, y, ...)


{
# Reproduce standard scatter plot, rugs for missing data
panel.xyplot(x, y, ...)
# Identify observations with one of the two values missing
x.missing <- is.na(x)
y.missing <- is.na(y)
# Draw rugs along axes
panel.rug(x = x[y.missing], y = y[x.missing])
}

airquality$Month.Name <- factor(month.name[airquality$Month], levels = month.name)


xyplot(Ozone ~ Solar.R | Month.Name, data = airquality, panel = panel.xyrug,
as.table = TRUE)

# Create factor variable with month names


airquality$Month.Name <- factor(month.name[airquality$Month], levels = month.name)

# Create box-and-whisker plot: specify outer, x-axis relation, layout, x-axis label
bwplot(Month.Name ~ Ozone + Temp, airquality, outer = T, scales = list(x =
list(relation = 'free')), layout = c(2,1), xlab = 'Measured value')

# Create violin plot: specify outer, x-axis relation, layout, label, and panel function
bwplot(Month.Name ~ Ozone + Temp, airquality,
outer = T, scales = list(x = list(relation = 'free')), layout = c(2,1),
xlab = 'Measured value', panel = panel.violin)

Alternative panel functions - panel.smoothScatter()


When there are a large number of points in the data, there may be substantial
overplotting in a standard scatter plot. Another built-in panel function available
in the lattice package that can serve as a replacement for panel.xyplot() in such
cases is panel.smoothScatter(). Instead of plotting the points directly, it uses a
color gradient to show a 2-D kernel density estimate obtained from the data. It can
be used to show the relationship between death rates due to cancer among males and
females in the USCancerRates data as follows:

xyplot(rate.female ~ rate.male, USCancerRates,
       panel = panel.smoothScatter,
       scales = list(log = TRUE, equispaced.log = FALSE),
       main = "County-wise deaths due to cancer")

You can compare the output of this code to the default scatter plot produced when
you omit the panel argument.
In this exercise you will customize this plot further by adding a nonparametric
LOESS smooth and a reference line along the y=x diagonal.

# Create panel function: panel.smoothScatter panel.loess panel.abline()


panel.ss <- function(x, y, ...) {
panel.smoothScatter(x, y, ...)
panel.loess(x, y, col = 'red')
panel.abline(0, 1)
}

# Create plot
xyplot(rate.female ~ rate.male, USCancerRates,
panel = panel.ss, main = "County-wise deaths due to cancer")
Prepanel functions with scales
While panel functions control the data display in each panel, prepanel functions
are used to set up the coordinate system for the display by calculating the minimal
axis extents required to contain the display. This calculation is done separately
for each panel, and these still need to be combined. As you have seen earlier, this
is controlled by the relation sub-component of the scales argument.
The goal of this exercise is to use the panel.histdens() and prepanel.histdens.2()
functions defined in the previous video exercise to reproduce the histogram of
log-transformed death rates in the USCancerRates data, but with "sliced" x-axis
limits, so that each panel gets its own limits while all panels span a range of the
same width.
When fitting a model, this kind of plot can help to decide whether it is reasonable
to assume equal variance for the two subgroups.
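
panel.histdens() itself is not reproduced in these notes. A plausible sketch (an assumption, not necessarily the exact function from the video) draws the default histogram panel and overlays a kernel density estimate; it assumes the histogram is drawn with type = "density" so the two are on the same scale:

panel.histdens <- function(x, ...) {
  # Standard histogram panel
  panel.histogram(x, ...)
  # Overlay a kernel density estimate of the same data
  d <- density(x, na.rm = TRUE)
  panel.lines(d$x, d$y)
}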

# Define prepanel function


prepanel.histdens.2 <- function(x, ...) {
h <- prepanel.default.histogram(x, ...)
d <- density(x, na.rm = TRUE)
list(xlim = quantile(x, c(0.005, 0.995), na.rm = TRUE),
# Calculate upper y-limit
ylim = c(0, max(d$y, h$ylim[2] )))
}

# Create a histogram of rate.male and rate.female


histogram(~ rate.male + rate.female,
data = USCancerRates, outer = TRUE,
type = "density", nint = 50,
border = "transparent", col = "lightblue",
# The panel function: panel.histdens; the prepanel function: prepanel.histdens.2
panel = panel.histdens, prepanel = prepanel.histdens.2,
# Ensure that the x-axis is log-transformed # and has relation "sliced"
scales = list(x = list(log = T, equispaced.log = F, relation =
'sliced')),
xlab = "Rate (per 100,000)")

Interaction plot of residuals

Graphics are used not only for initial exploration but also as an integral part of
the iterative process of model-building, where it is particularly useful for
analysis of residuals and other model diagnostics. Optional arguments that are
passed on to the default panel function make it relatively easy to create fairly
complex plots that are needed routinely in practice.

Suppose we fit the following linear model for the USRegionalMortality dataset based
on the previous interaction plot, and store the corresponding residuals in the
resids variable.

fm <- lm(log(Rate) ~ Cause * Sex + Status + Region, data = USRegionalMortality,
         na.action = na.exclude)
USRegionalMortality$resids <- residuals(fm)

For this exercise, your task is to look at the following three interaction plots
and decide which of the explanatory variables are most likely to be needed in the
model.

xyplot(resids ~ Region, USRegionalMortality,


groups = Sex, grid = list(h = -1, v = 0),
type = c("p", "a"), jitter.x = T,
auto.key = list(lines = T, points = T),
scales = list(x = list(rot = 45)))
xyplot(resids ~ Region, USRegionalMortality,
groups = Status, grid = list(h = -1, v = 0),
type = c("p", "a"), jitter.x = T,
auto.key = list(lines = T, points = T),
scales = list(x = list(rot = 45)))
xyplot(resids ~ Sex, USRegionalMortality,
groups = Status, grid = list(h = -1, v = 0),
type = c("p", "a"), jitter.x = TRUE,
auto.key = list(lines = TRUE, points = TRUE),
scales = list(x = list(rot = 45)))

# Create the box and whisker plot


bwplot(division.ordered ~ rate.male,
data = USCancerRates,
# Indicate median by line instead of dot
pch = '|',
# Include notches for confidence interval
notch = T,
# The x-axis should plot log-transformed values
scales = list(x = list(log = T, equispaced.log = F ) ),
xlab = "Death Rate in Males (per 100,000)")

Using emojis as plotting symbols

For the last exercise in this chapter, your task is to recreate a grouped dot plot
you have seen before, but replace the plotting characters by emoji images. To do
so, you will use the panel.xyimage() function in the latticeExtra package, which is
similar to the panel.xyplot() function, except that plotting symbols are replaced
by images whose locations (file names or URLs of JPEG or PNG image files) are
specified as the pch argument.

# Load the 'latticeExtra' package


library(latticeExtra)

# Create summary dataset


USCancerRates.state <-
with(USCancerRates, {
rmale <- tapply(rate.male, state, median, na.rm = TRUE)
rfemale <- tapply(rate.female, state, median, na.rm = TRUE)
data.frame(Rate = c(rmale, rfemale),
State = rep(names(rmale), 2),
Gender = rep(c("Male", "Female"), each = length(rmale)))
})

# Reorder levels
library(dplyr)
USCancerRates.state <- mutate(USCancerRates.state, State = reorder(State, Rate))

# URLs for emojis


emoji.man <- "https://twemoji.maxcdn.com/72x72/1f468.png"
emoji.woman <- "https://twemoji.maxcdn.com/72x72/1f469.png"

# Create dotplot, grouped by Gender, using emoji (size 75%)


dotplot(State ~ Rate , data = USCancerRates.state,
groups = Gender, panel = panel.xyimage,
pch = c(emoji.woman, emoji.man ), cex = .75)

The xyplot() method for time series objects

By making use of the object-oriented programming facilities supported by R,
high-level lattice functions can be extended to support data structures other than data
frames. A common example of such data structures is provided by time series
objects. The xyplot() function has a suitable method for time series objects. Your
goal in this exercise is to use it to create a time series plot of the built-in
EuStockMarkets dataset, which gives daily closing prices of four major European
stock indices.

The function to create the time-series plot is simply xyplot(). Instead of a formula
and a data frame, the only mandatory argument is a time series object, which must be
the first argument.

The main features of this method are:


- type (default = "l"), so that data points are joined by lines.
- superpose (T/F, default F) plot multiple time series within the same panel or in
separate panels.
- cut, a list of the form list(number = , overlap = ), to produce so-called "cut-
and-stack" plots, by splitting the time axis into multiple overlapping periods
which are then used to condition. This makes it easier to see parts of a long
series.

lattice is less strict than ggplot2 about the format of the dataset. Here, the data
is a time series rather than a data frame.

# Use 'EuStockMarkets' time series data


# Plot all series together # Split up the time axis into parts
str(EuStockMarkets)
xyplot(EuStockMarkets, superpose = T, cut = list(number = 3, overlap = .25))
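
To see the cut argument on its own, a minimal sketch with a single long series (base R's sunspots monthly time series, chosen here only for illustration):

# Cut-and-stack plot: the time axis is split into 3 overlapping chunks,
# each shown in its own panel
xyplot(sunspots, cut = list(number = 3, overlap = 0.1))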

## The xyplot() method with panel.horizonplot()


# latticeExtra:: panel.horizonplot().

library(latticeExtra)
xyplot(EuStockMarkets, panel = panel.horizonplot, prepanel =
prepanel.horizonplot)

Map projections
latticeExtra::mapplot() displays numeric data associated with geographical regions as colors.
Since the earth is three-dimensional but the plot is two-dimensional, a projection is
required to reduce the number of dimensions. The list of available projections is given
in the Details section of the mapproject() help page; examples include the Mercator and
polyconic projections.
Map plots are drawn in two stages.
First, a map object is created using the maps::map() with plot = FALSE.
Second, mapplot() is called with a formula, a data frame, and a map.

the_map <- map("a_map_dataset", plot = FALSE, projection = "some_projection")
mapplot(region ~ value, data, map = the_map)

# Load required packages


library(maps)
library(latticeExtra)
# Create map object for US counties # Specify projection
county.map <- map("county", plot = F, fill = T, projection = 'sinusoidal')
# Create choropleth map # Specify map
mapplot(rownames(USCancerRates) ~ log10(rate.male) + log10(rate.female), data =
USCancerRates,
xlab = "", scales = list(draw = F), map = county.map)

Confidence bands using the latticeExtra::segplot()


Statistical estimates in the form of confidence intervals can be displayed using
segment plots. Calls to segplot() should be in the following form.
segplot(
categories ~ lower_limit + upper_limit,
data = some_data,
centers = point_estimates
)

Categories are displayed on the y-axis, and the confidence intervals are displayed
on the x-axis. The point estimates, usually a mean or median value for that
category, are specified using the centers argument, not the formula. An optional
argument, draw.bands, lets you choose between confidence bands and confidence
intervals. This argument is passed to the default panel function panel.segplot().

The estimated county-wise death rates in the USCancerRates also have associated 95%
confidence bounds: LCL95.male, UCL95.male, LCL95.female, and UCL95.female. Plotting the
confidence bounds for all counties is not useful because there are too many
counties. For this exercise, your goal is to plot the county-wise confidence
intervals for males for the state of Louisiana.

# Load required packages


library(latticeExtra)
library(dplyr)

# Create subset for Louisiana # Reorder levels of county


LACancerRates <- filter(USCancerRates, state == 'Louisiana') %>%
  mutate(county = reorder(county, rate.male))
# Draw confidence intervals, point estimates - segments rather than bands
segplot(county ~ LCL95.male + UCL95.male, data = LACancerRates, centers =
rate.male, draw.bands = F )
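
For comparison, a sketch of the band version of the same display (same LACancerRates subset): setting draw.bands = TRUE draws filled rectangles spanning each interval instead of line segments.

segplot(county ~ LCL95.male + UCL95.male, data = LACancerRates,
        centers = rate.male, draw.bands = TRUE)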

Hexbinning of bivariate data


For datasets with a large number of observations, it is often better to plot some form
of bivariate density estimate instead of the raw data, just as histograms and kernel
density plots do for univariate data.

panel.smoothScatter() produces bivariate kernel density plots. A different graphical
design that is analogous to histograms uses hexagonal binning of the plane, using
color or radius to indicate the count in each bin.
hexbin::hexbinplot() for creating conditional hexbin plots using the lattice
framework.

Your task for this exercise is to use hexbinplot() to create a plot of death rates
among males and females in the USCancerRates dataset.
The formula and data arguments in a hexbinplot() call are interpreted in the same way
as in xyplot(). You will also use the following optional arguments:
- type argument can be set to "r" to add a regression line.
- trans argument can be a function that is applied to the observed counts before
creating bands for different colors. By default, the range of counts is divided up
evenly into bands, but taking the square root of the counts, for example,
emphasizes differences in the lower range of counts more.
- inv argument gives the inverse function of trans, so that transformed counts can
be converted back before being shown in the legend.

library(hexbin)
# Create hexbin plot, add regression line; supply functions to transform the counts and to invert the transformed counts
hexbinplot(rate.female ~ rate.male, data = USCancerRates, type = "r", trans = sqrt,
inv = function(x) x^2)

The directlabels package (works with lattice and ggplot2) tackles an interesting
problem: instead of having a separate legend associating
graphical parameters and levels of a grouping variable, it tries to indicate the
grouping by placing text labels within the panel. This is generally tricky to do
automatically. directlabels relies on heuristics, and also allows the user to
provide their own heuristics.

Your goal is to add within-panel group identification in a density plot of the
airquality dataset grouped by month. To make things interesting, you will create
side-by-side grouped density plots of two different variables rather than just one.

library(directlabels)
# Create factor variable
airquality$Month.Name <- factor(month.name[airquality$Month], levels = month.name)

# Create density plot object


# Variables in different panels, # Specify grouping variable
# Suppress display of data points, # Add reference line
# Specify layout # Omit strip labels
# Provide column-specific x-axis labels # Let panels have independent scales
tplot2 <- densityplot( ~ Ozone + Temp, data = airquality,
outer = T, groups = Month.Name,
plot.points = F, ref = T,
layout = c(2,1), strip = F,
xlab = c('Ozone (ppb)', 'Temperature (F)') ,
scales = list(x = list(relation = 'free') ) )

# Produce plot with direct labels


direct.label(tplot2)
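
direct.label() also accepts a positioning method as its second argument; as a sketch (assuming "last.points", one of directlabels' built-in methods, reads well for these density curves):

# Label each curve at its right-hand end instead of using the default heuristic
direct.label(tplot2, method = "last.points")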

Adding ggplot2-style layers


A lattice plot object can be modified using update(). A new panel function can be
provided as the panel argument, to change or enhance the panel display. Specifying
the display in the form of a function can be cumbersome, especially for minor
changes. An alternative approach, implemented in the latticeExtra package, is to
add so-called layers to the existing display. This is modeled on the approach used
by the ggplot2 package. There are two kinds of layers: layer_() goes under and
layer() goes over the existing display, with glayer_() and glayer() as the
equivalents for grouped displays.

A layer is created by putting a function call, as it would appear inside a panel
function, inside a call to layer_() or layer(). For example, suppose you want to
create a layer with a call to panel.grid that goes under the display, and a call to
panel.lmline() that goes above, and then add it to an existing lattice plot p.
Layers are added to a plot using the + operator.

under_layer <- layer_(panel.grid())
over_layer <- layer(panel.lmline(x, y))
p + under_layer + over_layer
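
For grouped displays, the analogous helpers are glayer_() and glayer(). A minimal sketch, reusing the airquality data from earlier exercises (the choice of variables is an assumption for illustration):

# Grouped scatter plot; glayer() evaluates its call once per group,
# here adding one loess smooth per Month
pg <- xyplot(Ozone ~ Temp, data = airquality, groups = Month)
pg + glayer(panel.loess(x, y))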

# Load the 'latticeExtra' package


library(latticeExtra)

# Create scatter plot # Change plotting character


p <- xyplot(rate.female ~ rate.male, data = USCancerRates, pch = 16, alpha = .25)
# Create an underlayer with a reference grid, and overlayers with a reference line and a regression fit
l0 <- layer_(panel.grid())
l1 <- layer(panel.abline(0, 1))
l2 <- layer(panel.smoother(x, y, method = 'lm'))
# Combine and plot
p + l0 + l1 + l2

#### GEOSPATIAL R

What does this look like? You've seen how you might make a basic plot of the sales:

ggplot(sales, aes(lon, lat)) +
  geom_point()

An equivalent way to specify the same plot is:

ggplot() +
geom_point(aes(lon, lat, size = $, col = $), data = sales)

corvallis_map_bw <- get_map(corvallis, source = 'stamen', maptype = 'toner', zoom = 13)
# Edit to display toner map
ggmap(corvallis_map_bw) +
geom_point(aes(lon, lat, color = year_built), data = sales)
# Alternative attempt (not the expected answer): map color inside base_layer
ggmap(corvallis_map_bw,
base_layer = ggplot(sales, aes(lon, lat, color = year_built)) )+
geom_point()
### expected
ggmap(corvallis_map_bw,
base_layer = ggplot(sales, aes(lon, lat))) +
geom_point(aes(color = year_built))

ggmap(corvallis_map_bw,
base_layer = ggplot(sales, aes(lon, lat))) +
geom_point(aes(color = class)) +
facet_wrap(~class)

A quick alternative

Like qplot() in ggplot2, qmplot() is less flexible than a full specification, but
often involves significantly less typing. qmplot() handles both downloading and
displaying the map, and its syntax is a blend of qplot(), get_map(), and ggmap().
Using qmplot():

qmplot(lon, lat, data = sales, geom = "point", color = class) +


facet_wrap(~ class)

We didn't specify a map, since qmplot() will grab one on its own. The qmplot() call
looks a lot like the qplot() call: use points to display the sales data, mapping
lon to the x-axis, lat to the y-axis, and class to color. qmplot() also sets the
default dataset and mapping (without the need for base_layer) so you can add facets
without any extra work.

qmplot(lon, lat, data = sales, geom = 'point', color = bedrooms) +
  facet_wrap(~ month)

# geom_path geom_polygon

ggplot(ward_sales, aes(lon, lat)) + geom_polygon(aes(fill = ward, group = group) )

# Fix the polygon cropping


ggmap(corvallis_map_bw, extent = 'normal', maprange= F, ## maprange
base_layer = ggplot(ward_sales, aes(lon, lat))) +
geom_polygon(aes(group = group, fill = ward)) +
geom_path(aes(group = group))

### alternative
qmplot(lon, lat, data = ward_sales, geom = "polygon", group = group, fill =
avg_price)

# Repeat again, but map fill to avg_price


ggmap(corvallis_map_bw,
base_layer = ggplot(ward_sales, aes(lon, lat)),
extent = "normal", maprange = FALSE) +
geom_polygon(aes(group=group, fill = avg_price), alpha=0.8)

Raster data as a heatmap. The predicted house prices in preds are called raster
data: you have a variable measured (or in this case predicted) at every location in
a regular grid.

Looking at head(preds) in the console, you can see the lat values stepping up in
intervals of about 0.002, as lon is constant. After 40 rows, lon increases by about
0.003, as lat runs through the same values. For each lat/lon location, you also
have a predicted_price. You'll see later in Chapter 3, that a more useful way to
think about (and store) this kind of data is in a matrix.

When data forms a regular grid, one approach to displaying it is as a heatmap.
geom_tile() in ggplot2 draws a rectangle centered on each location that fills the
space between it and the next location, in effect tiling the whole space. By mapping
a variable to the fill aesthetic, you end up with a heatmap.

ggplot(preds, aes(lon, lat)) +


geom_tile(aes(fill = predicted_price) )

# Use ggmap() instead of ggplot()


ggmap(corvallis_map_bw) +
geom_tile(aes(lon, lat, fill = predicted_price), data = preds, alpha = 0.8 )

Spatial objects
# Call str() on countries_sp
str(countries_sp)

# Call str() on countries_sp with max.level = 2


str(countries_sp, max.level = 2)

SPDF (SpatialPolygonsDataFrame) is an S4 object (OOP): like S3, but with formal slots accessed via @


# 169th element of countries_spdf@polygons: one
one <- countries_spdf@polygons[[169]]

# str() with max.level = 2, on the Polygons slot of one


str(one@Polygons, max.level = 2)

# str() with max.level = 2, on the 6th element of the one@Polygons


str(one@Polygons[[6]] , max.level = 2)

# Call plot on the coords slot of 6th element of one@Polygons


plot(one@Polygons[[6]]@coords )
## Since one@Polygons[[6]]@coords is just a matrix, plot() uses the default plot
## method, not the one for spatial objects.

## SPDF > Polygons > Polygons > Polygon

# Subset the 169th object of countries_spdf: usa


usa <- countries_spdf[169,]

# Pull out the name column using $


countries_spdf$name

# Pull out the subregion column using [[ use ""


countries_spdf[["subregion"]]

# Create logical vector: is_nz


is_nz <- countries_spdf$name == 'New Zealand'
# Subset countries_spdf using is_nz: nz
nz <- countries_spdf[is_nz,]

# Plot nz
plot(nz)

###
library(sp)
library(tmap)

# Use qtm() to create a choropleth map of gdp


qtm(shp = countries_spdf, fill = "gdp")

# Add style argument to the tm_fill() call


tm_shape(countries_spdf) +
tm_fill(col = "population", style = "quantile") +
# Add a tm_borders() layer
tm_borders(col = "burlywood4")

# New plot, with tm_bubbles() instead of tm_fill()


tm_shape(countries_spdf) +
tm_bubbles(size = "population") +
# Add a tm_borders() layer
tm_borders(col = "burlywood4")

# Switch to a Hobo–Dyer projection


tm_shape(countries_spdf, projection = 'hd') +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4")

# Switch to a Robinson projection


tm_shape(countries_spdf, projection = 'robin') +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4")

# Add tm_style_classic() to your plot


tm_shape(countries_spdf, projection = 'robin') +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4") +
tm_style_classic()

# Plot from last exercise


tm_shape(countries_spdf) +
tm_grid(n.x = 11, n.y = 11, projection = "longlat") +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4")

# Save a static version "population.png"


save_tmap(filename = 'population.png') #deprecated, use tmap_save()

# Save an interactive version "population.html"


save_tmap(filename = 'population.html')
A package that uses Raster objects

The tmap package makes visualizing spatial classes in sp easy. It works with the
raster classes too! You simply pass your Raster___ object as the shp argument to
tm_shape(), and then add a tm_raster() layer:

tm_shape(raster_object) +
tm_raster()

When working with a RasterStack or a RasterBrick object, such as the pop_by_age
object you created in the last exercise, you can display one of its layers using
the col (short for "color") argument in tm_raster(), surrounding the layer name in
quotes.

You'll work with tmap throughout the course, but we also want to show you another
package, rasterVis, also designed specifically for visualizing raster objects.
There are a few different functions you can use in rasterVis to make plots, but
let's just try one of them for now: levelplot().

library(tmap)

# Specify pop as the shp and add a tm_raster() layer


tm_shape(shp = pop ) +
tm_raster()

# Plot the under_1 layer in pop_by_age #### CAREFUL!


tm_shape(shp = pop_by_age ) + tm_raster(col = "under_1")

library(rasterVis)
# Call levelplot() on pop
levelplot(pop)

library(RColorBrewer)
blups <- brewer.pal(n = 9, 'BuPu')
ggplot(preds) +
geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
scale_fill_gradientn(colors = blups)

library(viridisLite)
vir = viridis(9)
ggplot(preds) +
geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
scale_fill_gradientn(colors = vir)

library(viridisLite)
mag <- magma(9)
ggplot(preds) +
geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
scale_fill_gradientn(colors = mag)

### Shortcut to get an RColorBrewer palette: use scale_*_distiller() and you only need
### to specify the palette name in the palette argument. See ?scale_fill_distiller.
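
A sketch of that shortcut, reusing the preds heatmap from above:

# scale_fill_distiller() pulls the ColorBrewer palette directly,
# so no brewer.pal() call is needed
ggplot(preds) +
  geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
  scale_fill_distiller(palette = "BuPu")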

Custom palette in tmap

Unlike ggplot2, where setting a custom color scale happens in a scale_ call, colors
in tmap layers are specified in the layer in which they are mapped. For example,
take a plot of the age_18_24 variable from prop_by_age:

tm_shape(prop_by_age) +
tm_raster(col = "age_18_24")

Since color is mapped in the tm_raster() call, the specification of the palette
also occurs in this call. Specify a vector of colors in the palette argument. This
is another reason it's worth learning ways to generate a vector of colors. While
different packages could have very different shortcuts for specifying palettes from
color packages, they will generally always have a way to pass in a vector of
colors.

# Generate palettes from last time


library(RColorBrewer)
blups <- brewer.pal(9, "BuPu")

library(viridisLite)
vir <- viridis(9)
mag <- magma(9)

# Use the blups palette


tm_shape(prop_by_age) +
tm_raster("age_18_24", palette = blups) +
tm_legend(position = c("right", "bottom"))

# Use the vir palette


tm_shape(prop_by_age) +
tm_raster("age_18_24", palette = vir) +
tm_legend(position = c("right", "bottom"))

# Use the mag palette but reverse the order


tm_shape(prop_by_age) +
tm_raster("age_18_24", palette = rev(mag) ) +
tm_legend(position = c("right", "bottom"))

mag <- viridisLite::magma(7)

library(classInt)

# Create 5 "pretty" breaks with classIntervals()


classIntervals(values(prop_by_age[["age_18_24"]]), n = 5, style = "pretty" )

# Create 5 "quantile" breaks with classIntervals()


classIntervals(values(prop_by_age[["age_18_24"]]), n = 5, style = "quantile" )

# Use 5 "quantile" breaks in tm_raster()


tm_shape(prop_by_age) +
tm_raster("age_18_24", palette = mag, style = 'quantile') +
tm_legend(position = c("right", "bottom"))

# Create histogram of proportions


hist(values(prop_by_age[["age_18_24"]]))

# Use fixed breaks in tm_raster()


tm_shape(prop_by_age) +
tm_raster("age_18_24", palette = mag,
style = "fixed",, breaks = c(0.025, 0.05, 0.1, 0.2, 0.25, 0.3, 1))
# Save your plot to "prop_18-24.html"
save_tmap(filename = 'prop_18-24.html')

Call classIntervals() on values(prop_by_age[["age_18_24"]]) with n = 5 and
style = "pretty". See the problem? 130,770 of your grid cells end up in the first bin.
Now call classIntervals() as above, but with style = "quantile".
Use the quantile bins by passing the n and style arguments into the tm_raster()
layer of your plot.
Make a histogram of values(prop_by_age[["age_18_24"]]). Where would you make the breaks?
Create your own breaks in tm_raster() by specifying
breaks = c(0.025, 0.05, 0.1, 0.2, 0.25, 0.3, 1).
Save your final plot as a leaflet plot using save_tmap() and the filename
"prop_18-24.html".

# Print migration
migration

# Diverging "RdGy" palette


red_gray <- brewer.pal(n = 7, "RdGy")

# Use red_gray as the palette


tm_shape(migration) +
tm_raster(palette = red_gray ) +
tm_legend(outside = TRUE, outside.position = c("bottom"))

# Add fixed breaks


tm_shape(migration) +
tm_raster(palette = red_gray, style = 'fixed', breaks = c(-5e6, -5e3, -5e2, -5e1,
5e1, 5e2, 5e3, 5e6) ) +
tm_legend(outside = TRUE, outside.position = c("bottom"))

library(raster)

# Plot land_cover
tm_shape(land_cover) +
tm_raster()

# Palette like the ggplot2 default


hcl_cols <- hcl(h = seq(15, 375, length = 9),
c = 100, l = 65)[-9]

# Use hcl_cols as the palette


tm_shape(land_cover) +
tm_raster(palette = hcl_cols)

# Examine levels of land_cover


levels(land_cover)

# A set of intuitive colors


intuitive_cols <- c(
"darkgreen",
"darkolivegreen4",
"goldenrod2",
"seagreen",
"wheat",
"slategrey",
"white",
"lightskyblue1"
)

# Use intuitive_cols as palette


tm_shape(land_cover) +
tm_raster(palette = intuitive_cols) +
tm_legend(position =c('left','bottom') )
