Roman Popat - The Data Lab
12/06/2017
rmnppt.com | @rmnppt | rmnppt |
@DataLabScotland
CivTech 1.0 Winners
library(tidyverse)
data_points
# A tibble: 646,015 x 6
id pothole_id sort x y z
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0.008300781 -0.3076630 -1.0217590
2 2 1 1 -0.007507324 -0.2951202 -0.9965515
3 3 1 2 -0.021316528 -0.2960968 -0.9669495
4 4 1 3 -0.019989014 -0.3049469 -0.9341583
5 5 1 4 -0.011123657 -0.3166504 -0.9088593
6 6 1 5 -0.009292603 -0.3313751 -0.8655090
7 7 1 6 -0.001800537 -0.3349457 -0.8573151
8 8 1 7 -0.011154175 -0.3375397 -0.8693085
9 9 1 8 -0.029937744 -0.3394012 -0.8852539
10 10 1 9 -0.085311890 -0.3360596 -0.8842010
# ... with 646,005 more rows
library(tidyverse)
potholes
# A tibble: 2,562 x 8
id verified latitude longitude gravity_threshold created_at
<dbl> <int> <dbl> <dbl> <dbl> <chr>
1 15 0 55.86916 -4.203986 0.0000000 2016-11-16 09:28:32
2 14 1 55.86940 -4.203184 0.0000000 2016-11-16 09:28:27
3 13 1 55.86943 -4.203083 0.0000000 2016-11-16 09:28:23
4 33 0 55.95801 -3.208395 1.1653390 2016-11-23 15:01:43
5 16 1 55.86904 -4.204424 0.0000000 2016-11-16 09:28:37
6 17 0 55.86891 -4.205028 1.0638300 2016-11-16 09:28:41
7 18 0 55.86865 -4.206044 1.0638300 2016-11-16 09:28:46
8 19 0 55.86859 -4.206300 1.0638300 2016-11-16 09:28:56
9 20 0 55.86810 -4.208178 0.4738878 2016-11-16 09:29:10
10 21 1 55.86807 -4.208325 0.4738878 2016-11-16 09:29:16
# ... with 2,552 more rows, and 2 more variables: updated_at <chr>,
# verified_model <int>
library(tidyverse)
segments <- data_points %>%
group_by(pothole_id) %>%
nest(x, y, z) %>%
left_join(., potholes, by = c("pothole_id" = "id")) %>%
mutate(verified = verified == 1) %>%
mutate(x = map(data, "x")) %>%
mutate(y = map(data, "y")) %>%
mutate(z = map(data, "z")) %>%
mutate(length = map(data, nrow)) %>%
filter(length == 250) %>% # segments vary in length
select(pothole_id, verified, x, y, z)
segments
# A tibble: 2,577 x 5
pothole_id verified x y z
<dbl> <lgl> <list> <list> <list>
1 1 NA <dbl [250]> <dbl [250]> <dbl [250]>
2 2 NA <dbl [250]> <dbl [250]> <dbl [250]>
3 3 NA <dbl [250]> <dbl [250]> <dbl [250]>
4 4 NA <dbl [250]> <dbl [250]> <dbl [250]>
5 5 NA <dbl [250]> <dbl [250]> <dbl [250]>
6 6 NA <dbl [250]> <dbl [250]> <dbl [250]>
7 7 NA <dbl [250]> <dbl [250]> <dbl [250]>
8 8 NA <dbl [250]> <dbl [250]> <dbl [250]>
9 9 NA <dbl [250]> <dbl [250]> <dbl [250]>
10 10 NA <dbl [250]> <dbl [250]> <dbl [250]>
# ... with 2,567 more rows
library(ggplot2)
ggplot(examples, aes(x = sort)) +
geom_line(aes(y = value, colour = axis), size = 1.5) +
facet_grid(id ~ verified) +
theme(text = element_text(size = 15))
library(TSclust)
tdiss <- diss(SERIES = segments$z, METHOD = "PACF")
str(tdiss)
Class 'dist' atomic [1:1955253] 1.143 0.625 0.405 0.298 0.633 ...
..- attr(*, "Size")= int 1978
..- attr(*, "call")= language as.dist.default(m = (distances))
..- attr(*, "Diag")= logi FALSE
..- attr(*, "Upper")= logi FALSE
library(Rtsne)
tsne <- Rtsne(tdiss)
tsne$results <- data.frame(
x = tsne$Y[,1],
y = tsne$Y[,2],
verified = segments$verified
)
tsne$results %>%
filter(!is.na(verified)) %>%
ggplot(aes(x = x, y = y)) +
geom_point(aes(colour = verified))
We now model the binary variable pothole / non-pothole as a function of the partial autocorrelation functions (PACFs) of the z accelerometer axis.
pothole ~ PACF + error
We will do this using library(caret)
and randomForest
.
load stuff in
library(caret)
library(randomForest)
d <- readRDS("data/pothole_pacf.rds") %>%
filter(!is.na(verified)) %>%
select(-pothole_id)
setup the control parameters
folds <- createFolds(
y = as.factor(d$verified),
k = 3
)
control <- trainControl(
method = "cv",
index = folds,
savePredictions = TRUE
)
tuning_grid <- data.frame(
mtry = seq(1:15)
)
fit the model
rf_fit <- train(
as.factor(verified) ~ .,
data = d,
method = "rf",
trControl = control,
ntree = 500,
tuneGrid = tuning_grid
)
rf_fit
Random Forest
856 samples
25 predictors
2 classes: 'FALSE', 'TRUE'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 285, 286, 285
Resampling results across tuning parameters:
mtry Accuracy Kappa
1 0.8586506 0.2533109
2 0.8755922 0.3960267
3 0.8861001 0.4842765
4 0.8831782 0.4746279
5 0.8901855 0.5226025
6 0.8872646 0.5173842
7 0.8878494 0.5238160
8 0.8860960 0.5219337
9 0.8901865 0.5383211
10 0.8890138 0.5407679
11 0.8843467 0.5260966
12 0.8860950 0.5291803
13 0.8855112 0.5302030
14 0.8825944 0.5223872
15 0.8825934 0.5254034
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 9.
see the results
ggplot(rf_fit)
rf_fit$finalModel$confusion
FALSE TRUE class.error
FALSE 78 65 0.45454545
TRUE 20 693 0.02805049
packages:
tidyverse
, especially dplyr
, purrr
, tidyr
ggplot2
Rtsne
TSdist
caret
randomForest
books:
rmnppt.com | @rmnppt | rmnppt |
Roman Popat