I'm trying to optimize a query which sets the values of a column storing the order position number which is the position of the given order as viewed from the user's account, based on the order create time.
Setup looks like this:
CREATE TABLE orders (
department_id BIGINT UNSIGNED NOT NULL,
order_hash CHAR(32) NOT NULL,
account_hash CHAR(32) NOT NULL,
create_time INT UNSIGNED NOT NULL DEFAULT 0,
order_position_number INT UNSIGNED NOT NULL DEFAULT 0,
PRIMARY KEY (department_id, order_hash),
INDEX fk_orders_account_idx (department_id, account_hash),
INDEX create_time_idx (department_id ASC, create_time ASC),
INDEX position_number_idx (department_id, account_hash, create_time ASC),
INDEX test_wo_dep_position_number_idx (account_hash, create_time ASC)
) ENGINE = InnoDB DEFAULT CHARACTER SET = utf8mb4;
INSERT INTO orders
(department_id, order_hash, account_hash, create_time)
VALUES
(1, "order_hash_1", "account_hash_1", 100),
(1, "order_hash_2", "account_hash_2", 102),
(1, "order_hash_3", "account_hash_1", 104),
(1, "order_hash_4", "account_hash_1", 101),
(1, "order_hash_5", "account_hash_2", 101),
(1, "order_hash_6", "account_hash_1", 109),
(2, "order_hash_7", "account_hash_1", 107),
(2, "order_hash_8", "account_hash_1", 106);
The full query which will be run to populate the order_position_number column for the first department is:
INSERT INTO orders
(department_id,
order_hash,
account_hash,
order_position_number
)
SELECT department_id,
order_hash,
account_hash,
DENSE_RANK() OVER(PARTITION BY account_hash ORDER BY create_time) - 1 AS order_position_number
FROM orders
WHERE department_id = 1
ON DUPLICATE KEY UPDATE
order_position_number = VALUES(order_position_number);
When the INSERT statement is run the performance is very poor.
I've run EXPLAIN on the INSERT query but it is not very helpful. This is the output:
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "6.85"
},
"table": {
"insert": true,
"select_id": 1,
"table_name": "orders",
"access_type": "ALL"
}
}
}
When EXPLAIN is run on only the SELECT there is more information which indicates the order_position indices are not being used (see using_filesort):
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "6.85"
},
"windowing": {
"windows": [
{
"name": "<unnamed window>",
"using_filesort": true,
"filesort_key": [
"`account_hash`",
"`create_time`"
],
"functions": [
"dense_rank"
]
}
],
"cost_info": {
"sort_cost": "6.00"
},
"table": {
"table_name": "orders",
"access_type": "ref",
"possible_keys": [
"PRIMARY",
"fk_orders_account_idx",
"create_time_idx",
"position_number_idx",
"test_wo_dep_position_number_idx"
],
"key": "PRIMARY",
"used_key_parts": [
"department_id"
],
"key_length": "8",
"ref": [
"const"
],
"rows_examined_per_scan": 6,
"rows_produced_per_join": 6,
"filtered": "100.00",
"cost_info": {
"read_cost": "0.25",
"eval_cost": "0.60",
"prefix_cost": "0.85",
"data_read_per_join": "1K"
},
"used_columns": [
"department_id",
"order_hash",
"account_hash",
"create_time"
]
}
}
}
}
When EXPLAIN ANALYZE is run on the SELECT query, there is more information indicating that the index is not being used for the sort operation within the window.
-> Window aggregate: dense_rank() OVER (PARTITION BY orders.account_hash ORDER BY orders.create_time ) (actual time=0.043..0.046 rows=6 loops=1)
-> Sort: orders.account_hash, orders.create_time (cost=0.85 rows=6) (actual time=0.036..0.037 rows=6 loops=1)
-> Index lookup on orders using PRIMARY (department_id=1) (actual time=0.016..0.021 rows=6 loops=1)
When the target query is run on a large orders table the performance is very poor. The documentation regarding optimization of RANK window functions is very rare so any help would be greatly appreciated.