I demonstrate improved binary classification by piping the results of a neural network into a local regression model (LOESS).
The data used is from the Rain in Australia project on Kaggle. View the link to see the structure and appearance of the data. What I do here is unique to what others have done and expands on the data manipulation (wrangling).
The primary challenge is to predict if the next day will experience more than 1mm of rain, given the information of the previous day.
It is possible to create another model that uses more days to predict the upcoming days. I could use a time series prediction approach similar to what I tried with the stock market. But I’m done playing with this dataset; there is more to explore.
From this exercise, I learned more about iterative imputation and improved my skills with deep learning.
I’ve foregone a clean example script for the exploration I performed. Important facts include that there are lots of values missing not at random, highly dependent on the weather station (Location) they came from. The observations where Rainfall is missing are identical to the ones with RainToday missing.
This is the script used in preparation for use with the neural network:
# Change of plans for prediction: Going to use black box models.
#
# I first used logistic regression models, but they were massive models and I
# decided it would be easier to switch to a neural network. The classical
# regression models were uninterpretable anyway, with hundreds of terms.
# (I could have also used a random forest, XGBoost, or similar.)
library(tidyverse)
library(lubridate)
library(mice)
# guess_max solves the issue of incorrect column type assignments.
rain <- read_csv("weatherAUS.csv", guess_max = 10000)
rain <- rain %>%
# Deriving periodic day of the year with trigonometric transformations
# and ignoring other aspects of the date, like the year.
mutate(Year = year(Date),
Day = yday(Date)) %>%
mutate(DateAngle = if_else(Year %% 4 == 0, Day/366, Day/365) * 2*pi) %>%
mutate(DateSine = sin(DateAngle),
DateCosine = cos(DateAngle)) %>%
select(-DateAngle, -Day, -Date, -Year) %>%
# Sunshine is almost 50% missing values, and not at random.
# Previously I removed it entirely with "select(-Sunshine) %>%",
# but this time I'll keep.
#
# Converting wind directions with similar periodic transformations
mutate(
WindGustSine = case_when(
WindGustDir == "N" ~ 1,
WindGustDir == "S" ~ -1,
WindGustDir %in% c("W", "E") ~ 0,
WindGustDir %in% c("NW", "NE") ~ sin(pi/4),
WindGustDir %in% c("SW", "SE") ~ sin(-pi/4),
WindGustDir %in% c("NNW", "NNE") ~ sin(3*pi/8),
WindGustDir %in% c("SSW", "SSE") ~ sin(-3*pi/8),
WindGustDir %in% c("WNW", "ENE") ~ sin(pi/8),
WindGustDir %in% c("WSW", "ESE") ~ sin(-pi/8)
),
WindGustCosine = case_when(
WindGustDir %in% c("N", "S") ~ 0,
WindGustDir == "W" ~ -1,
WindGustDir == "E" ~ 1,
WindGustDir %in% c("NW", "SW") ~ cos(3*pi/4),
WindGustDir %in% c("NE", "SE") ~ cos(pi/4),
WindGustDir %in% c("NNW", "SSW") ~ cos(5*pi/8),
WindGustDir %in% c("NNE", "SSE") ~ cos(3*pi/8),
WindGustDir %in% c("WNW", "WSW") ~ cos(7*pi/8),
WindGustDir %in% c("ENE", "ESE") ~ cos(pi/8)
),
WindSine9am = case_when(
WindDir9am == "N" ~ 1,
WindDir9am == "S" ~ -1,
WindDir9am %in% c("W", "E") ~ 0,
WindDir9am %in% c("NW", "NE") ~ sin(pi/4),
WindDir9am %in% c("SW", "SE") ~ sin(-pi/4),
WindDir9am %in% c("NNW", "NNE") ~ sin(3*pi/8),
WindDir9am %in% c("SSW", "SSE") ~ sin(-3*pi/8),
WindDir9am %in% c("WNW", "ENE") ~ sin(pi/8),
WindDir9am %in% c("WSW", "ESE") ~ sin(-pi/8)
),
WindCosine9am = case_when(
WindDir9am %in% c("N", "S") ~ 0,
WindDir9am == "W" ~ -1,
WindDir9am == "E" ~ 1,
WindDir9am %in% c("NW", "SW") ~ cos(3*pi/4),
WindDir9am %in% c("NE", "SE") ~ cos(pi/4),
WindDir9am %in% c("NNW", "SSW") ~ cos(5*pi/8),
WindDir9am %in% c("NNE", "SSE") ~ cos(3*pi/8),
WindDir9am %in% c("WNW", "WSW") ~ cos(7*pi/8),
WindDir9am %in% c("ENE", "ESE") ~ cos(pi/8)
),
WindSine3pm = case_when(
WindDir3pm == "N" ~ 1,
WindDir3pm == "S" ~ -1,
WindDir3pm %in% c("W", "E") ~ 0,
WindDir3pm %in% c("NW", "NE") ~ sin(pi/4),
WindDir3pm %in% c("SW", "SE") ~ sin(-pi/4),
WindDir3pm %in% c("NNW", "NNE") ~ sin(3*pi/8),
WindDir3pm %in% c("SSW", "SSE") ~ sin(-3*pi/8),
WindDir3pm %in% c("WNW", "ENE") ~ sin(pi/8),
WindDir3pm %in% c("WSW", "ESE") ~ sin(-pi/8)
),
WindCosine3pm = case_when(
WindDir3pm %in% c("N", "S") ~ 0,
WindDir3pm == "W" ~ -1,
WindDir3pm == "E" ~ 1,
WindDir3pm %in% c("NW", "SW") ~ cos(3*pi/4),
WindDir3pm %in% c("NE", "SE") ~ cos(pi/4),
WindDir3pm %in% c("NNW", "SSW") ~ cos(5*pi/8),
WindDir3pm %in% c("NNE", "SSE") ~ cos(3*pi/8),
WindDir3pm %in% c("WNW", "WSW") ~ cos(7*pi/8),
WindDir3pm %in% c("ENE", "ESE") ~ cos(pi/8)
)
) %>%
select(-c(WindGustDir, WindDir9am, WindDir3pm)) %>%
# Making location a quantitative feature.
# This requires a more complicated response surface than one-hot encoding,
# but (might) provide better results in the long run.
mutate(
Lat = case_when(
Location == "Albury" ~ -36.0737,
Location == "BadgerysCreek" ~ -33.8829,
Location == "Cobar" ~ -31.4958,
Location == "CoffsHarbour" ~ -30.2986,
Location == "Moree" ~ -29.4658,
Location == "Newcastle" ~ -32.9283,
Location == "NorahHead" ~ -33.2833,
Location == "NorfolkIsland" ~ -29.0408,
Location == "Penrith" ~ -33.7507,
Location == "Richmond" ~ -37.8230,
Location == "Sydney" ~ -33.8688,
Location == "SydneyAirport" ~ -33.9399,
Location == "WaggaWagga" ~ -35.1082,
Location == "Williamtown" ~ -32.8150,
Location == "Wollongong" ~ -34.4278,
Location == "Canberra" ~ -35.2809,
Location == "Tuggeranong" ~ -35.4244,
Location == "MountGinini" ~ -35.5294,
Location == "Ballarat" ~ -37.5622,
Location == "Bendigo" ~ -36.7570,
Location == "Sale" ~ -38.1026,
Location == "MelbourneAirport" ~ -37.6690,
Location == "Melbourne" ~ -37.8136,
Location == "Mildura" ~ -34.2080,
Location == "Nhil" ~ -36.3328,
Location == "Portland" ~ -38.3609,
Location == "Watsonia" ~ -37.7080,
Location == "Dartmoor" ~ -37.9144,
Location == "Brisbane" ~ -27.4698,
Location == "Cairns" ~ -16.9186,
Location == "GoldCoast" ~ -28.0167,
Location == "Townsville" ~ -19.2590,
Location == "Adelaide" ~ -34.9285,
Location == "MountGambier" ~ -37.8284,
Location == "Nuriootpa" ~ -34.4666,
Location == "Woomera" ~ -31.1656,
Location == "Albany" ~ -35.0269,
Location == "Witchcliffe" ~ -34.0261,
Location == "PearceRAAF" ~ -31.6676,
Location == "PerthAirport" ~ -31.9385,
Location == "Perth" ~ -31.9523,
Location == "SalmonGums" ~ -32.9815,
Location == "Walpole" ~ -34.9777,
Location == "Hobart" ~ -42.8821,
Location == "Launceston" ~ -41.4332,
Location == "AliceSprings" ~ -23.6980,
Location == "Darwin" ~ -12.4634,
Location == "Katherine" ~ -14.4521,
Location == "Uluru" ~ -25.3444),
Lon = case_when(
Location == "Albury" ~ 146.9135,
Location == "BadgerysCreek" ~ 150.7609,
Location == "Cobar" ~ 145.8389,
Location == "CoffsHarbour" ~ 153.1094,
Location == "Moree" ~ 149.8339,
Location == "Newcastle" ~ 151.7817,
Location == "NorahHead" ~ 151.5667,
Location == "NorfolkIsland" ~ 167.9547,
Location == "Penrith" ~ 150.6877,
Location == "Richmond" ~ 144.9980,
Location == "Sydney" ~ 151.2093,
Location == "SydneyAirport" ~ 151.1753,
Location == "WaggaWagga" ~ 147.3598,
Location == "Williamtown" ~ 151.8428,
Location == "Wollongong" ~ 150.8931,
Location == "Canberra" ~ 149.1300,
Location == "Tuggeranong" ~ 149.0888,
Location == "MountGinini" ~ 148.7723,
Location == "Ballarat" ~ 143.8503,
Location == "Bendigo" ~ 144.2794,
Location == "Sale" ~ 147.0730,
Location == "MelbourneAirport" ~ 144.8410,
Location == "Melbourne" ~ 144.9631,
Location == "Mildura" ~ 142.1246,
Location == "Nhil" ~ 141.6503,
Location == "Portland" ~ 141.6041,
Location == "Watsonia" ~ 145.0830,
Location == "Dartmoor" ~ 141.2730,
Location == "Brisbane" ~ 153.0251,
Location == "Cairns" ~ 145.7781,
Location == "GoldCoast" ~ 153.4000,
Location == "Townsville" ~ 146.8169,
Location == "Adelaide" ~ 138.6007,
Location == "MountGambier" ~ 140.7804,
Location == "Nuriootpa" ~ 138.9917,
Location == "Woomera" ~ 136.8193,
Location == "Albany" ~ 117.8837,
Location == "Witchcliffe" ~ 115.1003,
Location == "PearceRAAF" ~ 116.0292,
Location == "PerthAirport" ~ 115.9672,
Location == "Perth" ~ 115.8613,
Location == "SalmonGums" ~ 121.6438,
Location == "Walpole" ~ 116.7338,
Location == "Hobart" ~ 147.3272,
Location == "Launceston" ~ 147.1441,
Location == "AliceSprings" ~ 133.8807,
Location == "Darwin" ~ 130.8456,
Location == "Katherine" ~ 132.2715,
Location == "Uluru" ~ 131.0369)
) %>%
select(-Location) %>%
# Make numerical
mutate(RainToday = if_else(RainToday == "Yes", 1, 0),
RainTomorrow = if_else(RainTomorrow == "Yes", 1, 0))
# Imputed value flags to be provided to predictor
imputed_tags <- rain %>%
mutate(MinTemp_Imputed = if_else(is.na(MinTemp), 1, 0),
MaxTemp_Imputed = if_else(is.na(MaxTemp), 1, 0),
Rainfall_Imputed = if_else(is.na(Rainfall), 1, 0),
Evaporation_Imputed = if_else(is.na(Evaporation), 1, 0),
Sunshine_Imputed = if_else(is.na(Sunshine), 1, 0),
WindGustSpeed_Imputed = if_else(is.na(WindGustSpeed), 1, 0),
WindSpeed9am_Imputed = if_else(is.na(WindSpeed9am), 1, 0),
WindSpeed3pm_Imputed = if_else(is.na(WindSpeed3pm), 1, 0),
Humidity9am_Imputed = if_else(is.na(Humidity9am), 1, 0),
Humidity3pm_Imputed = if_else(is.na(Humidity3pm), 1, 0),
Pressure9am_Imputed = if_else(is.na(Pressure9am), 1, 0),
Pressure3pm_Imputed = if_else(is.na(Pressure3pm), 1, 0),
Cloud9am_Imputed = if_else(is.na(Cloud9am), 1, 0),
Cloud3pm_Imputed = if_else(is.na(Cloud3pm), 1, 0),
Temp9am_Imputed = if_else(is.na(Temp9am), 1, 0),
Temp3pm_Imputed = if_else(is.na(Temp3pm), 1, 0),
RainToday_Imputed = if_else(is.na(RainToday), 1, 0),
RainTomorrow_Imputed = if_else(is.na(RainTomorrow), 1, 0),
WindGustSine_Imputed = if_else(is.na(WindGustSine), 1, 0),
WindGustCosine_Imputed = if_else(is.na(WindGustCosine), 1, 0),
WindSine9am_Imputed = if_else(is.na(WindSine9am), 1, 0),
WindCosine9am_Imputed = if_else(is.na(WindCosine9am), 1, 0),
WindSine3pm_Imputed = if_else(is.na(WindSine3pm), 1, 0),
WindCosine3pm_Imputed = if_else(is.na(WindCosine3pm), 1, 0)) %>%
select(-(1:28))
rain_imputed <- rain %>%
# Iteratively imputing missing values with predictive mean matching
#
# I recommend learning more about `mice` and PMM. I still do not know
# everything about it.
# `mice` assumes that data is not missing at random, which is failed in this
# case. I created the `*_Imputed` columns above to indicate where values are
# not "real" and directly feed this info to the model.
# `m` determines the number of imputing instances to create. 10 are created
# here at random, providing a bit more diversity to the training data.
# As I have done it, the real values will appear multiple times (10) without
# change and this gives them more weight in training.
mice(m = 10, method = "pmm", maxit = 10) %>%
# Stack imputed datasets
complete("long")
# Stack tags
imputed_tags <- bind_rows(imputed_tags, imputed_tags, imputed_tags,
imputed_tags, imputed_tags, imputed_tags,
imputed_tags, imputed_tags, imputed_tags,
imputed_tags)
# Horizontally combine stacks
rain <- bind_cols(rain_imputed, imputed_tags)
# Impute RainToday from Rainfall
rain <- rain %>%
mutate(RainToday = if_else(Rainfall > 1, 1, 0))
write_csv(rain, "bb_rain.csv")
remove(rain_imputed, imputed_tags)
This script trains the model and saves predictions.
I didn’t do any standard scaling when training the neural network simply because I forgot to. This is the first time I’ve completely ignored the step. It shouldn’t make a difference in the final results, but scaling can make training faster. Skipping standardization resulted in a little less work, but isn’t something I would normally forgo.
library(tidyverse)
library(tensorflow)
library(keras)
rain <- read_csv("bb_rain.csv")
# Take single dataset to test
rain <- rain %>%
filter(.imp == 1, RainTomorrow_Imputed == 0) %>%
select(-c(.imp, .id))
# Null prediction
sum(rain$RainTomorrow == 0)/nrow(rain)
# Get sets
X <- rain %>% select(-RainTomorrow, -RainTomorrow_Imputed)
y <- rain %>% select(RainTomorrow)
train_rows <- sample(1:nrow(rain), round(0.85*nrow(rain)))
X_train <- X[train_rows,]
y_train <- y[train_rows,]
X_test <- X[-train_rows,]
y_test <- y[-train_rows,]
inputs <- layer_input(shape = c(50))
outputs <- inputs %>%
layer_dense(50, activation = 'elu') %>%
layer_dense(25, activation = 'elu') %>%
layer_dense(13, activation = 'elu') %>%
layer_dense(6, activation = 'elu') %>%
layer_dense(1, activation = 'sigmoid')
model <- keras_model(inputs = inputs, outputs = outputs)
model %>% compile(
optimizer = 'adam',
loss = 'binary_crossentropy',
metrics = c('accuracy'))
model %>% fit(
as.matrix(X_train),
as.matrix(y_train),
validation_data = list(
as.matrix(X_test),
as.matrix(y_test)),
#batch_size = 32,
epochs = 500,
callbacks = list(
callback_early_stopping(
monitor = 'val_loss',
mode = 'min',
patience = 25))) # Try amping this up more!
rain_complete <- read_csv("bb_rain.csv")
X_complete <- rain_complete %>%
select(-.imp, -.id, -RainTomorrow, -RainTomorrow_Imputed)
y_pred <- model %>% predict(as.matrix(X_complete))
rain_complete$RainTomorrow_Prediction <- as.vector(y_pred)
write_csv(rain_complete, "results.csv")
# *Cleanup the environment before proceeding.*
library(tidyverse)
rain <- read_csv("results.csv")
rain <- rain %>%
group_by(.id) %>%
mutate(RainTomorrow_Prediction = mean(RainTomorrow_Prediction)) %>%
filter(.imp == 1) %>%
ungroup()
rain %>%
filter(RainTomorrow_Imputed == 0) %>%
ggplot(aes(x = RainTomorrow_Prediction,
y = RainTomorrow == 1,
color = (RainTomorrow == 1))) +
geom_jitter(alpha = 0.1, width = 0) +
theme_light() +
labs(y = "More than 1mm of rain", x = "Model Estimated Probability") +
theme(legend.position = "none")