diff --git a/post/2023/linesplit/index.qmd b/post/2023/linesplit/index.qmd index d4e5f1c..990c231 100644 --- a/post/2023/linesplit/index.qmd +++ b/post/2023/linesplit/index.qmd @@ -3,9 +3,9 @@ title: "Splitting and rasterising linestrings representing transport networks" # # Alternative title: # title: "Teaching R and Python for geographic data: shared experiences and lessons learned" author: - - name: Iwo Augustyniak - affiliation: University of Warsaw, Poland - orcid: 0000-0002-1057-3721 + - name: Iwo AugustyƄski + affiliation: Wroclaw Univeristy of Economics and Business, Poland + orcid: 0000-0003-2145-0484 - name: Robin Lovelace affiliation: University of Leeds, Active Travel England, UK orcid: 0000-0001-5679-6536 @@ -27,14 +27,6 @@ A key message if you're doing this kind of operation on medium to large datasets ## Pre-requisites ```{r} -# The problem -# -# There is a need to split road network with the grid. -# The only function for this I'm aware of is lwgeom::st_split -# This function is quite slow for dense grid (thousands of lines) and many roads (milions) -# -# The challange is how to optimize it or make more 'smarter' - library(sf) library(dplyr) ``` @@ -49,183 +41,296 @@ We used the following input datasets and parameters: ## The case study area -For the case study area we used a small area in Poznan around the Adam Mickiewicz University campus, as shown in the map below. +For the case study area we used main roads in Poznan, as shown in the map below. ```{r} -buffer_distance = 100 -# study_area = sf::st_read("https://github.com/Robinlovelace/opengeohub2023/raw/main/pois_buffer_simple.geojson") -geography_department = tmaptools::geocode_OSM("Collegium Geographicum, Poznan", as.sf = TRUE) -study_area = sf::st_buffer(geography_department$point, buffer_distance) -plot(study_area) -``` +roads_all = osmextract::oe_get("Poznan", extra_tags = "maxspeed", force_vectortranslate = TRUE) +roads = roads_all |> + filter(highway %in% c("motorway", "primary", "secondary", "tertiary")) |> + tibble::rowid_to_column() -## Transport network data - -```{r} -roads = osmextract::oe_get("Poznan", boundary = study_area, boundary_type = "clipsrc") -nrow(roads) plot(roads$geometry) ``` +The dataset consists of `r nrow(roads)` roads. ## Grid of lines Set the number of cells in the grid: ```{r} -n_cells = 100 -lines_number = round(sqrt(n_cells)) +nrows = 50 +ncols = 50 + ``` ```{r} # One way of creating it: -grid_polygons = sf::st_make_grid(study_area, n = c(10, 10)) -grid_polygons_lines = sf::st_cast(grid_polygons, "LINESTRING") -grid_vect = terra::vect(grid_polygons) -grid_rast = terra::rast(grid_vect, nrow = lines_number, ncol = lines_number) -terra::values(grid_rast) = runif(n_cells) +raster <- terra::rast(nrows = nrows, ncols = ncols, terra::ext(roads), vals = 1) ``` We'll define a function to create the grid, that contains lines that go horizontally and vertically, as follows: ```{r} #| code-fold: true -mkgrid <- function(x, lines_number = 10) { - # function creates grid based on the bbox of the sf object with defined number of lines - # inspired by https://grass.osgeo.org/grass82/manuals//v.mkgrid.html - lines_number = lines_number + 1 - pbox <- sf::st_bbox(x) - pbox <- data.frame(xmin = pbox[1], ymin = pbox[2], xmax = pbox[3], ymax = pbox[4]) - - x = seq.int(from = pbox$xmin, to = pbox$xmax, length.out = lines_number) - - # vertical lines - grid_v <- as.data.frame(x) |> - mutate(y = x, from = pbox$ymin, to = pbox$ymax) |> - as.matrix() - geom <- apply(grid_v, 1, function(x) st_as_text(st_linestring(matrix(x, ncol = 2))), simplify = T) - grid_v <- cbind(as.data.frame(grid_v), geom) |> st_as_sf(wkt = "geom", crs = "EPSG:4326") - - # horizontal lines - y = seq.int(from = pbox$ymin, to = pbox$ymax, length.out = lines_number) - grid_h <- as.data.frame(y) |> - mutate(x = y, from = pbox$xmin, to = pbox$xmax) |> select(from, to, x, y) |> - as.matrix() - geom <- apply(grid_h, 1, function(x) st_as_text(st_linestring(matrix(x, ncol = 2))), simplify = T) - grid_h <- cbind(as.data.frame(grid_h), geom) |> st_as_sf(wkt = "geom", crs = "EPSG:4326") - - # grid as one object - grid <- st_union(rbind(grid_h, grid_v)) - - list(grid = grid, grid_h = grid_h, grid_v = grid_v) +raster_to_grid <- function(raster){ + stars::st_as_stars(raster) |> st_as_sf() |> st_cast("POINT") |> # creates five points at the corners of the cell (first point is doubled to close the rectangle) + tibble::rownames_to_column() |> + filter(!rowname %in% as.character(1:(nrows*ncols))) |> # removes duplicated top-left point + distinct(geometry) |> # removes duplicated points from the adjacent cells + tibble::rowid_to_column() |> + group_by(rowid) |> # allows to rowwise execution of st_coordinates + mutate(x = st_coordinates(geometry)[1], + y = st_coordinates(geometry)[2]) |> + ungroup() |> + mutate( + position = case_when( # we need only borders + y == min(y) ~ "top", + y == max(y) ~ "bottom", + x == min(x) ~ "left", + x == max(x) ~ "right") + ) |> + filter(!is.na(position)) |> # preserves only points on edges + group_by(position) |> mutate(id = 1:n()) |> # numbers points in groups for joins + group_by(id) |> + mutate(position = case_when( # points must be grouped according to line types not edges + position %in% c("top", "bottom") ~ "v", + position %in% c("left", "right") ~ "h" + ) + ) |> + group_by(id, position) |> + summarise(id = mean(id)) |> # points to multipoint + st_cast("LINESTRING") |> # multipoint to linestring + ungroup() |> select(-id) # cleaning } ``` Construct the grid as follows: ```{r} -grid = mkgrid(study_area, lines_number = lines_number) -class(grid$grid) -plot(grid$grid) -``` - -We can check that the `grid` and `grid_polygon` objects are equivalent as follows: - -```{r} -plot(grid$grid, lwd = 9, col = "red") -plot(grid_polygons_lines, lwd = 3, col = "blue", add = TRUE) +grid <- raster_to_grid(raster) ``` -And for `terra` objects: +We can check that the grid matches roads: ```{r} -library(terra) -plot(grid_rast, lwd = 9) -plot(grid_vect, lwd = 3, col = NA, add = TRUE) +plot(grid$geometry, lwd = 3, col = "red") +plot(roads$geometry, lwd = 3, col = "blue", add = TRUE) ``` # Approaches +## 1. Segmentize roads to 100 m fragments ```{r} ##### 1. segmentize time_st_segmentize = system.time({ roads_segmented <- st_as_sf(roads, wkt = "geometry", crs = "EPSG:4326") |> - sf::st_segmentize(dfMaxLength = 10) + sf::st_segmentize(dfMaxLength = units::as_units("100 m")) }) +time_st_segmentize +``` +## 2. Simple crop roads to the grid with `lwgeom::st_split()` function +```{r} ##### 2. st_split -time_st_split <- system.time({roads_splitted <- lwgeom::st_split(tibble::rowid_to_column(roads), grid$grid)|> - sf::st_collection_extract("LINESTRING") |> - group_by(rowid) |> summarise(sum = mean(rowid)) |> st_cast("LINESTRING")}) +time_st_split <- system.time({roads_split <- lwgeom::st_split(roads, grid)|> + sf::st_collection_extract("LINESTRING")}) + +time_st_split +``` +## 3. Crop roads with single lines from the grid +```{r} ##### 3. split_lines -split_lines <- function(roads, blades_h, blades_v) { - blades_h <- collapse::rsplit(blades_h, 1:nrow(blades_h)) # data.frame to list - blades_v <- collapse::rsplit(blades_v, 1:nrow(blades_v)) # data.frame to list - tmp <- purrr::map(blades_h,\(x) lwgeom::st_split(tibble::rowid_to_column(roads), x)) |> # split roads line by line +split_lines1 <- function(rds, blades) { #splits roads with blades + rds <- mutate(rds, rowid = as.factor(rowid)) + rds <- collapse::rsplit(rds, ~ rowid) # data.frame with roads to list + tmp <- purrr::map(rds,\(x) lwgeom::st_split(x, blades)) |> # split roads line by line purrr::list_rbind() |> # resulting list to data.frame sf::st_as_sf() |> # data.frame to sf object - sf::st_collection_extract("LINESTRING") |> - group_by(rowid) |> summarise(sum = mean(rowid)) |> st_cast("MULTILINESTRING") # splitted road segments back to one road - - # the same but with vertical lines - purrr::map(blades_v,\(x) lwgeom::st_split(tmp, x)) |> - purrr::list_rbind() |> sf::st_as_sf() |> sf::st_collection_extract("LINESTRING") |> - group_by(rowid) |> summarise(sum = mean(rowid)) |> st_cast("MULTILINESTRING") # st_cast("LINESTRING") 'disappears' parts of multilines + sf::st_collection_extract("LINESTRING") } -time_split_lines <- system.time({roads_split_lines <- split_lines(roads, grid$grid_h, grid$grid_v)}) +time_split_lines1 <- system.time({roads_split_lines1 <- split_lines1(roads, grid)}) +time_split_lines1 +``` +## 4. Crop only roads that intersect with the grid +```{r} ##### 4. split_lines2 -split_lines2 <- function(x, h, v) { - blades <- rbind(h, v) +split_lines2 <- function(x, blades) { crosses <- st_filter(x, blades, .predicate = st_crosses) - others <- filter(roads, !osm_id %in% (crosses$osm_id)) + others <- filter(x, !osm_id %in% (crosses$osm_id)) splitted <- lwgeom::st_split(crosses, blades) rbind(splitted, others) |> sf::st_collection_extract("LINESTRING") } -time_split_lines2 <- system.time({roads_split_lines2 <- split_lines2(roads, grid$grid_h, grid$grid_v)}) +time_split_lines2 <- system.time({roads_split_lines2 <- split_lines2(roads, grid)}) +time_split_lines2 +``` -times_combined = rbind(time_st_segmentize, time_st_split, time_split_lines, time_split_lines2) +## 5. Crop using GRAS GIS +```{r} +#### 5. split with GRASS GIS +grass_split <- function(roads, grid){ + vgrid <- terra::vect(stars::st_as_stars(raster) |> st_as_sf()) + vroads <- terra::vect(roads) + + rgrass::write_VECT(x = vgrid, vname = "grid") + rgrass::write_VECT(vroads, "roads") + rgrass::stringexecGRASS("v.clip input=roads clip=grid output=clipped --overwrite") + rgrass::read_VECT("clipped") |> st_as_sf() +} -##### Visual verification +rgrass::initGRASS(gisBase = "/usr/lib/grass78", + home = tempdir(), + SG = raster, + override = TRUE) +time_grass <- system.time({roads_split_grass <- grass_split(roads, grid)}) +time_grass ``` +# Results + +```{r} +times_combined <- rbind(time_st_segmentize, + time_st_split, + time_split_lines1, + time_split_lines2, + time_grass) |> as.data.frame() |> + arrange(elapsed) + +knitr::kable(times_combined) + -We can visualise the results as follows. +``` +The results are not identical ```{r} -library(tmap) -tmap_mode("view") +cbind(method = c("Segmentation","Split", "Split 1", "Split 2", "Grass"), + rbind(nrow(roads_segmented), + nrow(roads_split), + nrow(roads_split_lines1), + nrow(roads_split_lines2), + nrow(roads_split_grass)) +) |> + knitr::kable(col.names = c("Method", "Number of rows")) +``` + + +Most of the approaches are sensitive to size of a dataset. +```{r} +roads_less = roads_all |> + filter(highway %in% c("motorway", "primary", "secondary")) + +time_st_segmentize_less = system.time({ + roads_segmented_less <- st_as_sf(roads_less, wkt = "geometry", crs = "EPSG:4326") |> + sf::st_segmentize(dfMaxLength = units::as_units("100 m")) +}) + +time_st_split_less <- system.time({roads_splitted_less <- lwgeom::st_split(roads_less, grid)|> + sf::st_collection_extract("LINESTRING")}) + +roads_less <- tibble::rowid_to_column(roads_less) +time_split_lines1_less <- system.time({roads_split_lines1_less <- split_lines1(roads_less, grid)}) + +time_split_lines2_less <- system.time({roads_split_lines2_less <- split_lines2(roads_less, grid)}) + +time_grass_less <- system.time({roads_split_grass_less <- grass_split(roads_less, grid)}) + -tm_shape(st_cast(roads_splitted$geometry[10], 'POINT')) + tm_dots(col = "red", scale = 4) + - tm_shape(st_cast(roads_split_lines$geometry[10], 'POINT')) + tm_dots(col = "blue", scale = 2) + - tm_shape(st_cast(roads$geometry[10], 'POINT')) + tm_dots(col = "green") + - tm_shape(grid$grid_v) + tm_lines(lty = "dotted") + - tm_shape(grid$grid_h) + tm_lines(lty = "dotted") + - tm_shape(grid$grid) + tm_lines() + - tm_basemap("Esri.WorldImagery") +times_combined <- rbind(time_st_segmentize, + time_st_segmentize_less, + time_st_split, + time_st_split_less, + time_split_lines1, + time_split_lines1_less, + time_split_lines2, + time_split_lines2_less, + time_grass, + time_grass_less + ) |> as.data.frame() |> + select(elapsed) +times_combined$n_roads <- rep(c(nrow(roads), nrow(roads_less)), 5) +times_combined$elapsed_per_100_roads <- times_combined$elapsed/times_combined$n_roads*100 + knitr::kable(arrange(times_combined, elapsed_per_100_roads)) ``` +Surprisingly except of st_split and st_segmentize all methods are faster for bigger dataset. The grass method has probably quite big overhead therefore it gains with size. Simple st_split is most sensitive to the number of lines to crop. Only segmentize is insensitive. + + # Rasterisation ```{r} -roads_split_lines$length = st_length(roads_split_lines) -rasterised_lines = terra::rasterize(roads_split_lines, grid_rast, field = "length", fun = sum) -plot(rasterised_lines) -plot(vect(roads), add = TRUE) +# raster grid to vector +grid_u <- stars::st_as_stars(raster) |> st_as_sf() %>% tibble::rowid_to_column() + + +rast_length <- function(x){ + st_join(grid_u, select({ x }, length), join = st_covers) %>% + group_by(rowid) %>% + summarise(sum = sum(length, na.rm = T)) %>% + terra::rasterize(raster, field = "sum") +} + +rast_split <- rast_length(roads_split) +rast_lines1 <- rast_length(roads_split_lines1) +rast_lines2 <- rast_length(roads_split_lines2) +rast_grass <- rast_length(roads_split_grass) ``` -# Results +Difference between all roads_split methods and roads_grass: +```{r} +terra::plot(rast_split- rast_grass, type = "interval", breaks = c(6000, 100, 0, -100, -6000), col = c("red", "green", "white", "green", "red")) +terra::plot(terra::vect(roads), add = TRUE) +``` +Differences in tabular form +```{r} +lngth <- function(x){ + st_join(grid_u, select({ x }, length), join = st_covers) %>% + group_by(rowid) %>% + summarise(sum = sum(length, na.rm = T)) %>% + filter(sum> units::as_units("0 m")) %>% + st_drop_geometry() +} + +sum_split <- lngth(roads_split) +# sum_lines1 <- lngth(roads_split_lines1) +# sum_lines2 <- lngth(roads_split_lines2) +sum_grass <- lngth(roads_split_grass) +x <- full_join(sum_split, sum_grass, by = "rowid") %>% mutate(dif = sum.x-sum.y) %>% + filter(dif > units::as_units("1 m") | dif < units::as_units("-1 m")) +filter(x, dif == min(x$dif)) +``` + +LEt's have a closer look at this grid cell: ```{r} -knitr::kable(times_combined) +library(tmap) +tmap_mode("view") +g <- select(roads_split_grass, osm_id, length) +g <- st_filter(g, f, .predicate = st_covered_by) + +s <- select(roads_split, osm_id, length) +s <- st_filter(s, f, .predicate = st_covered_by) + +tm_shape(grid_u %>% filter(rowid==1521)) + tm_polygons(col = "violet", alpha = 0.5) + + tm_shape(g, bbox = bb) +tm_lines(col = "red", scale = 3) + + tm_shape(s, bbox = bb) +tm_lines(col = "green") + +``` +Sum of roads' length is reported consistently with this picture: +```{r} +sum(s$length) +sum(g$length) ``` + + + # Discussion and next steps