I'm running a Python script to find flight routes using a dataset of around 20k flights (~4,000 flights per day). When profiling the code with cProfile, I notice that the cumulative time it records is much less than the measured query execution time. The discrepancy I'm seeing is:
Total time across all function calls reported by cProfile: 0.468 seconds
Measured query execution time: 4.25759482383728 seconds
Here's the profiler's output:
16499769 function calls in 0.468 seconds
Ordered by: cumulative time
ncalls tottime percall cumtime percall filename:lineno(function)
4167579 0.155 0.000 0.155 0.000 {method 'get' of 'dict' objects}
4059336 0.101 0.000 0.101 0.000 {method 'append' of 'collections.deque' objects}
4059365 0.099 0.000 0.099 0.000 {method 'popleft' of 'collections.deque' objects}
3972499 0.090 0.000 0.090 0.000 {built-in method builtins.len}
135618 0.019 0.000 0.019 0.000 /Users/<user>/PycharmProjects/route-parser/pure-python-9-8.py:109(<setcomp>)
95182 0.003 0.000 0.003 0.000 {method 'append' of 'list' objects}
1589 0.000 0.000 0.001 0.000 {method 'sort' of 'list' objects}
8316 0.000 0.000 0.000 0.000 /Users/<user>/PycharmProjects/route-parser/pure-python-9-8.py:91(<lambda>)
125 0.000 0.000 0.000 0.000 /Users/<user>/PycharmProjects/route-parser/pure-python-9-8.py:85(<lambda>)
127 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects}
30 0.000 0.000 0.000 0.000 /Users/<user>/PycharmProjects/route-parser/pure-python-9-8.py:95(<genexpr>)
2 0.000 0.000 0.000 0.000 {built-in method time.time}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
I understand that cProfile is not measuring wall-clock time, but how can I bring these two numbers closer together?
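To make the comparison concrete, here is a minimal sketch of the kind of sanity check I have in mind: timing the same work with and without the profiler enabled, using time.perf_counter. (run_query here is a hypothetical stand-in for my search loop, not the real code.)

import cProfile
import time

def run_query():
    # Hypothetical stand-in for the search loop being profiled
    total = 0
    for i in range(10**6):
        total += i
    return total

# Wall time without the profiler
t0 = time.perf_counter()
run_query()
unprofiled = time.perf_counter() - t0

# Wall time with the profiler enabled around the same work
pr = cProfile.Profile()
t0 = time.perf_counter()
pr.enable()
run_query()
pr.disable()
profiled = time.perf_counter() - t0

print(f"unprofiled: {unprofiled:.3f}s, profiled: {profiled:.3f}s, "
      f"overhead: {profiled - unprofiled:.3f}s")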
This is the program; eventually I'll be reading from a DB instead of a CSV file.
import cProfile
import csv
import pstats
import time
from collections import defaultdict
from collections import deque
from datetime import timedelta
from io import StringIO
from dateutil.parser import parse
flights = []

with open('lambda.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # Skip the header
    for row in reader:
        flight_id, origin, destination, start_datetime, end_datetime, capacity, equipment_type, tail_number = row
        start_datetime = parse(start_datetime)
        end_datetime = parse(end_datetime)
        flights.append({
            'flight_id': flight_id,
            'origin': origin,
            'destination': destination,
            'start_datetime': start_datetime,
            'end_datetime': end_datetime,
            'capacity': int(capacity),
            'equipment_type': equipment_type,
            'tail_number': tail_number
        })

# Initialize a connectivity map to store direct connections between airports
connectivity_map = defaultdict(set)

# Build the connectivity map from the flights data
for flight in flights:
    connectivity_map[flight['origin']].add(flight['destination'])
def find_routes(origin, destination, start_datetime=None, end_datetime=None):
    """
    Find possible flight routes from origin to destination within a specified time window.

    Parameters:
    - origin (str): The starting airport code.
    - destination (str): The destination airport code.
    - start_datetime (str): The start date and time in ISO format. If provided, searches 48 hours ahead from this time.
    - end_datetime (str): The end date and time in ISO format. If provided, searches 48 hours before this time.

    Returns:
    None (prints the valid paths, number of paths, and query execution time).
    """
    if start_datetime is not None:
        start_timestamp = int(parse(start_datetime).timestamp())
        end_timestamp = int(start_timestamp + timedelta(hours=48).total_seconds())
    elif end_datetime is not None:
        end_timestamp = int(parse(end_datetime).timestamp())
        start_timestamp = int(end_timestamp - timedelta(hours=48).total_seconds())
    else:
        raise ValueError("Either start_datetime or end_datetime must be provided")

    for flight in flights:
        flight['start_timestamp'] = flight['start_datetime'].timestamp()
        flight['end_timestamp'] = flight['end_datetime'].timestamp()

    # Filter out the flights which don't fit within the specified time window
    flights_filtered = [
        flight for flight in flights if
        start_timestamp <= flight['start_timestamp'] <= end_timestamp and
        end_timestamp >= flight['end_timestamp']
    ]

    pr = cProfile.Profile()
    s = StringIO()
    pr.enable()
    start_time = time.time()

    # Build a mapping from origin to destination with lists of flights that follow that route
    next_flights_map = defaultdict(lambda: defaultdict(list))
    for flight in flights_filtered:
        next_flights_map[flight['origin']][flight['destination']].append(flight)

    # Sort the flights for each route based on their start timestamp
    for origin_data in next_flights_map.values():
        for destination_data in origin_data.values():
            destination_data.sort(key=lambda x: x['start_timestamp'])

    valid_paths = []

    # Initialize the exploration queue with the first set of flights from the origin
    to_explore = deque([flight] for dest_flights in next_flights_map[origin].values() for flight in dest_flights)

    while to_explore:
        path = to_explore.popleft()
        last_flight = path[-1]

        if last_flight['destination'] == destination:
            valid_paths.append(path)
            continue

        if len(path) >= 4:
            continue

        # Keep track of airports already visited in the current path to avoid loops
        visited_airports = {flight['origin'] for flight in path}

        # Find potential next destinations that haven't been visited yet
        next_possible_destinations = connectivity_map[last_flight['destination']] - visited_airports

        # For each potential next destination, explore the flights going there
        for dest in next_possible_destinations:
            potential_flights = next_flights_map[last_flight['destination']].get(dest, [])
            for next_flight in potential_flights:
                # Ensure the next flight starts after the last one ends
                if next_flight['start_timestamp'] > last_flight['end_timestamp']:
                    new_path = path + [next_flight]
                    to_explore.append(new_path)

    end_time = time.time()
    pr.disable()

    sortby = 'cumulative'
    ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
    ps.print_stats()
    print(s.getvalue())
    print(len(valid_paths))
    print(f"Query executed in: {end_time - start_time} seconds")
# Test the function with a sample input
find_routes('MIA', 'ATL', start_datetime='2023-09-15T10:00:00Z')
My question is: why is there such a significant discrepancy between the cProfile cumulative time and the measured execution time of the query? Am I missing something, or am I not accounting for some overhead?
Also, any improvements to the algorithm would be appreciated :D
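One improvement I'm already considering (a sketch against the same flights structure as above) is computing the Unix timestamps once when the data is loaded, instead of rebuilding them on every call to find_routes:

# Sketch: enrich each flight once at load time so that find_routes
# no longer recomputes 'start_timestamp'/'end_timestamp' per query.
for flight in flights:
    flight['start_timestamp'] = flight['start_datetime'].timestamp()
    flight['end_timestamp'] = flight['end_datetime'].timestamp()

That way the per-query work would be only the filtering and the search itself.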