Cumulative length of a LINESTRING using sf or terra


Is there a more efficient way to measure the cumulative length of a LINESTRING or MULTILINESTRING? In other words, I need to measure the distance between points from the start to the end of the line. Currently, I've only come up with the idea of splitting the LINESTRING into individual segments and measuring the length of each one. It works, but it takes a lot of time because of the iterative approach. Is there an existing built-in method?

Below is a reprex I came up with, using the Seine river from spData as an example. Even though it relies on the sf and geos packages, I'm happy to hear about any other spatial packages, such as terra or even rsgeo.

Cheers!

library(sf)
#> Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
library(geos)
library(spData)
#> To access larger datasets in this package, install the spDataLarge
#> package with: `install.packages('spDataLarge',
#> repos='https://nowosad.github.io/drat/', type='source')`

# Example
lines <- seine[2, ]

# My foo
cumulative_length <- 
  function(input) {
    
    # Save CRS
    crs <- sf::st_crs(input)
    
    # Retrieve coordinates
    lines_coo <- 
      sf::st_coordinates(input)
    
    # Count number of segments of linestring
    n <- nrow(lines_coo) - 1
    
    # Pre-allocate a list 
    lines_geos <- 
      vector(mode = "list", length = n)
    
    # Construct linestrings
    for (i in 1:n) {
      lines_geos[[i]] <- 
        geos::geos_make_linestring(lines_coo[i:(i+1),1], 
                                   lines_coo[i:(i+1),2], 
                                   crs = crs)
    }
    
    # Measure cumulative segment length (append(0, 0) prepends a 0 for the start point)
    lines_order <- 
      sapply(lines_geos, geos::geos_length) |> 
      append(0, 0) |> 
      cumsum()
    
    return(lines_order)
    
  }

bench::mark(cumulative_length(lines))
#> # A tibble: 1 × 6
#>   expression                    min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>               <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cumulative_length(lines)     13ms   13.7ms      72.9     244KB     109.

Created on 2024-03-23 with reprex v2.0.2

UPDATE

The desired output is exactly as follows, i.e. a monotonically increasing numeric vector of length from 0 to st_length(lines):

cumulative_length(lines) |> 
  head()
#> [1]    0.000 1716.196 3290.379 4824.087 6745.759 7446.660
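As a quick sanity check (a sketch, assuming a single-part line in a projected CRS, so that geos and sf agree on lengths), the last element should match the total length of the line:

all.equal(
  tail(cumulative_length(lines), 1),
  as.numeric(sf::st_length(lines))
) # expected: TRUE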

There are 4 answers

Josep Pueyo (best answer)

Profiling your function shows that most of the time is spent in geos_make_linestring. So, assuming you can transform your data to planar coordinates (e.g. UTM), you can skip that step by calculating the distance between consecutive points directly:

library(sf)
library(geos)
library(spData)
# Example
input <- seine[2, ] |> 
  st_transform(23032)

# My foo
cumulative_length <- function(input) {
  
  # Save CRS
  crs <- sf::st_crs(input)
  
  # Retrieve coordinates
  lines_coo <- 
    sf::st_coordinates(input)
  
  # Count number of segments of linestring
  n <- nrow(lines_coo) - 1
  
  # Pre-allocate a list 
  lines_geos <- 
    vector(mode = "list", length = n)
  
  # Construct linestrings
  for (i in 1:n) {
    lines_geos[[i]] <- 
      geos::geos_make_linestring(lines_coo[i:(i+1),1], 
                                 lines_coo[i:(i+1),2], 
                                 crs = crs)
  }
  
  # Measure cumulative segment length
  lines_order <- 
    sapply(lines_geos, geos::geos_length) |> 
    append(0, 0) |> 
    cumsum()
  
  return(lines_order)
}

cartesian_dist <- function(x1, y1, x2, y2) {
  # sqrt((x2 - x1)^2 + (y2 - y1)^2)
  xmax <- max(x1, x2)
  xmin <- min(x1, x2)
  ymax <- max(y1, y2)
  ymin <- min(y1, y2)
  sqrt((xmax - xmin)^2 + (ymax - ymin)^2)
}

cumulative_length2 <- function(input) {
    
    # Retrieve coordinates
    lines_coo <- sf::st_coordinates(input)
    
    # Count number of segments of linestring
    n <- nrow(lines_coo) - 1
    
    # Pre-allocate a list 
    dists <- numeric(n)
    
    # Compute segment distances
    for (i in 1:n) {
      dists[[i]] <- cartesian_dist(lines_coo[i, 1], lines_coo[i, 2], lines_coo[i+1, 1], lines_coo[i+1, 2])
    }
    
    # Measure cumulative segment length
    c(0, cumsum(dists))
    
}
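For reference, the same computation can be vectorized with base R alone, avoiding the explicit loop. A minimal sketch (diff() takes differences between consecutive rows of the coordinate matrix; not included in the benchmark below):

cumulative_length3 <- function(input) {
  # X-Y coordinates as a matrix
  xy <- sf::st_coordinates(input)[, 1:2]
  # Per-segment planar lengths from consecutive-row differences
  seg <- sqrt(rowSums(diff(xy)^2))
  c(0, cumsum(seg))
}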

microbenchmark::microbenchmark(
  cumulative_length(input),
  cumulative_length2(input),
  times = 100
)
#> Unit: milliseconds
#>                       expr     min      lq      mean   median       uq     max
#>   cumulative_length(input) 22.6733 23.5284 27.012571 24.42025 28.93890 53.6292
#>  cumulative_length2(input)  2.5730  2.7048  3.484514  2.82825  2.98925 21.8944
#>  neval
#>    100
#>    100

Created on 2024-03-23 with reprex v2.1.0

In case you can't convert to planar coordinates, you can keep geographic (lon/lat) WGS84 coordinates and use the haversine formula to calculate the distance between points instead of the Cartesian distance:

haversine_dist <- function(lon1, lat1, lon2, lat2){
  R <- 6371e3 # metres
  phi1 <- lat1 * pi/180 # radians
  phi2 <- lat2 * pi/180 # radians
  delta_phi <- (lat2 - lat1) * pi/180 # radians
  delta_lambda <- (lon2 - lon1) * pi/180 # radians
  
  a <- sin(delta_phi/2)^2 + cos(phi1) * cos(phi2) * sin(delta_lambda/2)^2
  c <- 2 * atan2(sqrt(a), sqrt(1 - a))
  
  R * c
}
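A quick usage example (a sketch; the coordinates are approximate lon/lat values for Paris and Lyon, and the result is the great-circle distance in metres):

haversine_dist(2.3522, 48.8566, 4.8357, 45.7640)
# roughly 392000 m, i.e. about 392 km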
Robert Hijmans

You can use terra::perim to get the perimeter of polygons or the length of lines.

library(terra)
v <- vect(system.file("ex/lux.shp", package="terra"))
perim(v)
# [1] 117100.12  93477.28  84502.45  44919.14  85032.61  74708.05  57991.42  81203.93  74443.82  95564.74  80618.76 70810.67

This works for both planar and for angular (lon/lat) coordinates.
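To illustrate (a sketch; EPSG:32632, UTM zone 32N, is an assumed planar CRS covering most of Luxembourg), projecting the data first gives very similar values, since perim() computes geodesic lengths for lon/lat input:

perim(project(v, "EPSG:32632")) |> head()
# close to perim(v), up to the small scale distortion of UTM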

To get the length of line/polygon segments you first need to disaggregate them. Here it is illustrated for the first polygon in v:

x <- disagg(v[1,], segments=TRUE)
p <- perim(x)
head(p)
#[1] 1383.51779  350.05402  580.19502   93.65138  333.32716  798.43439

sum(p)
#[1] 117100.1

And with your example data you get the numbers you want:

vect(lines) |> disagg(TRUE) |> perim() |> cumsum() |> head()
# [1] 1716.196 3290.379 4824.087 6745.759 7446.660 8303.283
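Note that this starts at the first segment rather than at 0. If you need the exact desired output from the question, prepend a zero:

c(0, vect(lines) |> disagg(TRUE) |> perim() |> cumsum()) |> head()
# [1]    0.000 1716.196 3290.379 4824.087 6745.759 7446.660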
atsyplenkov

Thank you all for your quick answers; I like all of them. Every previous answer demonstrated a huge speed increase over the original question. I played a bit with the code and found what is, to my knowledge, the most efficient approach; it may be useful for future reference. Below is a benchmark of a Rust function written with the extendr package: it outperforms {terra} by ca. 18 times and base R by ca. 9 times. The resulting vectors are identical (see check = TRUE).

  ## RUST
  # Wrap code from Josep Pueyo answer into RUST function
  code <- r"(
    use extendr_api::prelude::*;

    #[extendr]
    fn cartesian_distance(x1: f64, y1: f64, x2: f64, y2: f64) -> f64 {
      // Find max and min points
      let xmax = x1.max(x2);
      let xmin = x1.min(x2);
      let ymax = y1.max(y2);
      let ymin = y1.min(y2);

      // Calculate the distance between two points in 2D space
      let dx = xmax - xmin;
      let dy = ymax - ymin;
      (dx*dx + dy*dy).sqrt()
    }

    #[extendr]
    fn cartesian_distance_vector(x_vec: Vec<f64>, y_vec: Vec<f64>) -> Vec<f64> {
      // Pre-allocate the result vector
      let mut cumulative_distances: Vec<f64> = Vec::with_capacity(x_vec.len());
      let n = x_vec.len() - 1;
      let mut cumulative_distance = 0.0;

      // Iterate over consecutive X-Y pairs
      for i in 0..n {
        let distance = cartesian_distance(x_vec[i], y_vec[i], x_vec[i+1], y_vec[i+1]);

        // Accumulate cumulative distance
        cumulative_distance += distance;

        // Push cumulative distance to the result vector
        cumulative_distances.push(cumulative_distance);
      }

      cumulative_distances
    }
)"
rextendr::rust_source(code = code)
#> ℹ build directory: 'C:/Users/TsyplenkovA/AppData/Local/Temp/RtmpquAnXb/file3d85de14e0f'
#> ✔ Writing 'C:/Users/TsyplenkovA/AppData/Local/Temp/RtmpquAnXb/file3d85de14e0f/target/extendr_wrappers.R'

## Base R
base_r <- function(X, Y){
  
  cartesian_dist <- function(x1, y1, x2, y2) {
    # sqrt((x2 - x1)^2 + (y2 - y1)^2)
    xmax <- max(x1, x2)
    xmin <- min(x1, x2)
    ymax <- max(y1, y2)
    ymin <- min(y1, y2)
    sqrt((xmax - xmin)^2 + (ymax - ymin)^2)
  }
  
  n <- length(X) - 1
  
  # Pre-allocate a list 
  dists <- numeric(n)
  
  # Construct linestrings
  for (i in 1:n) {
    dists[[i]] <- cartesian_dist(X[i], Y[i], X[i+1], Y[i+1])
  }
  
  # Measure cumulative segment length
  cumsum(dists)
  
}

## Terra
library(terra)
#> terra 1.7.71
library(sf) # sf package is only needed for reading seine
#> Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
lv <- vect(spData::seine[2, ])


## Benchmark
bench::mark(
  rust = cartesian_distance_vector(
    geom(lv)[, 3],
    geom(lv)[, 4]
  ),
  base_r = base_r(
    geom(lv)[, 3],
    geom(lv)[, 4]
  ),
  terra = {
    lv |>
      disagg(TRUE) |>
      perim() |>
      cumsum()
  },
  check = TRUE,
  relative = TRUE
)
#> # A tibble: 3 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 rust        1      1        16.9       10.1     4.71
#> 2 base_r      9.32   8.59      2.15      49.7     5.11
#> 3 terra      20.0   18.5       1          1       1

Created on 2024-03-26 with reprex v2.1.0

0
Philippe Grosjean On

The Rcpp version is even faster than the rust one:

library(Rcpp)
cppFunction('NumericVector cartesian_distance_cpp(NumericVector x_vec, NumericVector y_vec) {
  int n = x_vec.size() - 1;
  NumericVector cumulative_distances(n);
  double cumulative_distance = 0.0;

  for (int i = 0; i < n; i++) {
    double distance = sqrt(pow(x_vec[i] - x_vec[i+1], 2) + pow(y_vec[i] - y_vec[i+1], 2));
    cumulative_distance += distance;
    cumulative_distances[i] = cumulative_distance;
  }
  return cumulative_distances;
}')
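For completeness, a benchmark sketch along those lines (it assumes terra, lv, and the Rust cartesian_distance_vector() from the previous answer are still loaded in the session; bench::mark() checks that both results agree):

bench::mark(
  rcpp = cartesian_distance_cpp(geom(lv)[, 3], geom(lv)[, 4]),
  rust = cartesian_distance_vector(geom(lv)[, 3], geom(lv)[, 4])
)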

On my machine, Rust is 1.6 times slower than Rcpp... but note that {terra} wins on the memory side, requiring 7 times less memory than Rust, Rcpp, or base R.