# Practice Spark & R ----
#
# Walkthrough script: connect to a local Spark instance, copy two data sets
# into it, aggregate flight delays per aircraft with dplyr verbs, pull the
# result back into R, and plot it.
#
# Besides the packages attached below, the nycflights13 package must be
# installed (it is referenced via `nycflights13::flights`).

library(sparklyr)
library(dplyr)
library(ggplot2)

# Open a connection to Spark using the "local" master.
sc <- spark_connect(master = "local")

# Copy local data frames into Spark; each call returns a remote table
# reference. The flights table is registered under the name "flights".
iris_tbl <- copy_to(sc, iris)
flights_tbl <- copy_to(sc, nycflights13::flights, "flights")

# Per tail number: number of flights, mean distance and mean arrival delay.
# `na.rm = TRUE` is spelled out because SQL aggregates always skip missing
# values; stating it makes that explicit and avoids the dbplyr warning.
# Keep only well-sampled (count > 20), shorter-haul (dist < 2000) aircraft
# with a defined mean delay, then collect() the result into a local tibble.
delay <- flights_tbl %>%
  group_by(tailnum) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dist < 2000, !is.na(delay)) %>%
  collect()

# plot delays
ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1 / 2) +
  geom_smooth() +
  scale_size_area(max_size = 2)

# Close the Spark connection once the collected data is no longer needed
# from the cluster.
spark_disconnect(sc)