I'm working on a project to generate a global grid map with nodes representing ocean areas only, which will be used for sea route pathing on a global scale. The nodes should be connected with edges to their adjacent nodes, forming a walkable path where each node represents a point on the ocean.
Here are the parameters defining the grid:
LAT_MIN, LAT_MAX, LON_MIN, LON_MAX = -90, 90, -180, 180
LAT_STEP, LON_STEP = 0.3, 0.3
The grid map will be quite large, given the small step size and the global scale. I have a geopackage file ne_10m_land.gpkg that includes land polygons, and my goal is to exclude land nodes from the grid, leaving only the ocean nodes.
To inspect the contents of the geopackage file, I've printed the first few entries using geopandas:
import geopandas as gpd
land = gpd.read_file("routing/data/geopackages/ne_10m_land.gpkg")
print(land.head(5))
This results in the following output:
geometry
0 MULTIPOLYGON (((-0.00029 -71.49903, 0.01555 -7...
1 MULTIPOLYGON (((166.13697 -50.86435, 166.20525...
2 MULTIPOLYGON (((-78.78897 -33.60906, -78.78038...
3 MULTIPOLYGON (((163.98512 -20.04762, 163.98609...
4 MULTIPOLYGON (((134.70737 -6.58904, 134.72006 ...
The geometry column contains MULTIPOLYGON shapes representing land areas.
Questions:
- How can I optimize the land intersection checks to improve performance given the MULTIPOLYGON data structure?
- Are there any data structures or algorithms that can help reduce memory usage while still allowing me to query nodes and their neighbors efficiently?
- Would preprocessing the geopackage data into a more efficient format for this task be beneficial, and if so, what format and approach should I consider?
For full context, here is my code:
import os
import logging
import concurrent.futures
import traceback
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from concurrent.futures import ProcessPoolExecutor
from math import radians, cos, sin, asin, sqrt
from itertools import product
import numpy as np
import pickle
class Node:
    """A single grid point: geographic position, walkability, and weighted edges."""

    def __init__(self, latitude, longitude, node_id, walkable=True):
        self.latitude = latitude
        self.longitude = longitude
        self.id = node_id          # conventionally the (lat, lon) tuple
        self.walkable = walkable   # False when the point falls on land
        self.neighbors = {}        # neighbor id -> great-circle distance (km)

    def add_neighbor(self, neighbor_node, distance):
        """Record a directed edge to *neighbor_node* with weight *distance* (km)."""
        self.neighbors[neighbor_node.id] = distance

    def calculate_distance(self, other):
        """Return the haversine great-circle distance to *other* in kilometers.

        Uses the scalar ``math`` functions already imported at module level
        rather than numpy ufuncs: for single-point inputs this avoids array
        allocation overhead and is substantially faster in the tight
        neighbor-wiring loop.
        """
        lon1, lat1 = radians(self.longitude), radians(self.latitude)
        lon2, lat2 = radians(other.longitude), radians(other.latitude)
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        # 6371 km = mean Earth radius
        return 2 * asin(sqrt(a)) * 6371
class GridMap:
    """Global lat/lon grid of Node objects; points on land are non-walkable.

    The land GeoDataFrame and its spatial index are loaded once at
    class-definition time and shared by every instance (and, on fork-based
    platforms, by worker processes).
    """

    script_dir = os.path.dirname(os.path.abspath(__file__))
    land = gpd.read_file(os.path.join(script_dir, "data/geopackages/ne_10m_land.gpkg"))
    land_sindex = land.sindex

    def __init__(self, lat_min, lat_max, lon_min, lon_max, lat_step, lon_step):
        # Maps (lat, lon) -> Node.  NOTE(review): keys are floats produced by
        # np.arange; neighbor lookup relies on exact float equality, which is
        # fragile — consider integer grid indices instead.
        self.nodes = {}
        self.create_grid(lat_min, lat_max, lon_min, lon_max, lat_step, lon_step)

    def create_grid(self, lat_min, lat_max, lon_min, lon_max, lat_step, lon_step):
        """Serially create every node in the bounding box, then link neighbors."""
        lats = np.arange(lat_min, lat_max, lat_step)
        lons = np.arange(lon_min, lon_max, lon_step)
        total_nodes = len(lats) * len(lons)
        # max(1, ...) guards the modulo below against ZeroDivisionError when
        # the grid has fewer than 100 nodes (the original crashed there).
        log_every = max(1, total_nodes // 100)
        nodes_created = 0
        for lat in lats:
            for lon in lons:
                # Reuse create_node instead of duplicating its body here.
                node = self.create_node(lat, lon, (lat, lon))
                self.nodes[node.id] = node
                nodes_created += 1
                if nodes_created % log_every == 0:
                    logging.info("Grid creation progress: %.2f%%",
                                 nodes_created / total_nodes * 100)
        self.add_all_neighbors(lat_step, lon_step)

    def create_node(self, lat, lon, node_id):
        """Build one Node, testing the point against the land spatial index.

        Raises whatever geopandas/shapely raised, after logging the location.
        """
        try:
            point = Point(lon, lat)  # shapely points are (x=lon, y=lat)
            # Coarse R-tree filter first, then the exact geometry predicate.
            candidate_idx = list(GridMap.land_sindex.intersection(point.bounds))
            candidates = GridMap.land.iloc[candidate_idx]
            precise = candidates[candidates.intersects(point)]
            # Walkable (ocean) iff no land polygon actually contains the point.
            return Node(lat, lon, node_id=(lat, lon), walkable=precise.empty)
        except Exception as e:
            logging.error("Failed to create node at (%s, %s): %s", lat, lon, e)
            raise

    def add_all_neighbors(self, lat_step, lon_step):
        """Connect each node to its (up to) 8 grid neighbors, haversine-weighted."""
        total_checks = len(self.nodes) * 8  # 8 candidate neighbors per node
        log_every = max(1, total_checks // 100)  # guard against tiny grids
        checks_done = 0
        for (lat, lon), node in self.nodes.items():
            for dlat in (-lat_step, 0, lat_step):
                for dlon in (-lon_step, 0, lon_step):
                    if dlat == 0 and dlon == 0:
                        continue  # skip the node itself
                    neighbor = self.nodes.get((lat + dlat, lon + dlon))
                    if neighbor is not None:
                        node.add_neighbor(neighbor, node.calculate_distance(neighbor))
                    checks_done += 1
                    if checks_done % log_every == 0:
                        logging.info("Neighbor addition progress: %.2f%%",
                                     checks_done / total_checks * 100)

    def create_grid_parallel(self, lat_range, lon_range, lat_step, lon_step):
        """Create nodes across worker processes, then link neighbors.

        Fixes two bugs in the original: the (lat, lon, id) tuple was passed to
        create_node as a SINGLE positional argument (every worker raised
        TypeError) — it must be unpacked — and add_all_neighbors was never
        called, leaving the graph edge-less.
        """
        args = ((lat, lon, (lat, lon)) for lat, lon in product(lat_range, lon_range))
        with concurrent.futures.ProcessPoolExecutor() as executor:
            future_to_latlon = {executor.submit(self.create_node, *arg): arg
                                for arg in args}
            for future in concurrent.futures.as_completed(future_to_latlon):
                latlon = future_to_latlon[future]
                try:
                    node = future.result()
                    self.nodes[node.id] = node
                except Exception as exc:
                    logging.error("%r generated an exception: %s", latlon, exc)
                    traceback.print_exc()
        self.add_all_neighbors(lat_step, lon_step)

    def save_to_file(self, filename):
        """Pickle the entire grid to *filename*."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load_from_file(filename):
        """Unpickle a previously saved grid. Only load files you created yourself —
        pickle executes arbitrary code from untrusted data."""
        with open(filename, 'rb') as f:
            return pickle.load(f)
# Usage with the specified variables: global coverage at 0.3-degree resolution.
LAT_MIN, LAT_MAX, LON_MIN, LON_MAX = -90, 90, -180, 180
LAT_STEP, LON_STEP = 0.3, 0.3

# Precomputed coordinate ranges (kept available for create_grid_parallel callers).
lat_range = np.arange(LAT_MIN, LAT_MAX, LAT_STEP)
lon_range = np.arange(LON_MIN, LON_MAX, LON_STEP)

# Cache the (expensive) grid build on disk and reuse it across runs.
grid_filename = 'grid_map.pkl'
if os.path.exists(grid_filename):
    # Only unpickle files this script produced itself — pickle is not safe
    # against untrusted data.
    grid_map = GridMap.load_from_file(grid_filename)
else:
    # GridMap.__init__ already builds the entire grid AND links neighbors.
    # The original script then also called create_grid_parallel, which
    # recomputed every node a second time; that redundant pass is removed.
    grid_map = GridMap(LAT_MIN, LAT_MAX, LON_MIN, LON_MAX, LAT_STEP, LON_STEP)
    grid_map.save_to_file(grid_filename)
I want to conclude with one final question: is this feasible at all? I am aware of the computational resources and time this will take, so if someone assures me that it is not worth my while and that I should limit my grid to much smaller bounds, I will gladly follow any recommendations you can provide.