Representing collections of set partitions in a relational database

55 views Asked by At

I have a set of elements. There are various ways I would like to partition the set, in the mathematical sense. A partition is defined as a collection of subsets such that every element is in exactly one subset.

I want to represent this situation in a relational database, and I want the relational constraints to enforce the mathematical definition of a partition.

The closest I have gotten is a schema like this:

CREATE TABLE element (
    id integer PRIMARY KEY
);

CREATE TABLE partition (
    id integer PRIMARY KEY
);

CREATE TABLE subset (
    id integer PRIMARY KEY,
    partition_id integer REFERENCES partition (id)
);

CREATE TABLE element_subset_mapping (
    element_id integer REFERENCES element (id) NOT NULL,
    partition_id integer REFERENCES partition (id) NOT NULL,
    subset_id integer REFERENCES subset (id) NOT NULL,
    CONSTRAINT uq_element_partition UNIQUE (element_id, partition_id)
);

This represents correctly that an element can only occur in one subset for a given partition. But there are at least a couple problems:

  • It does not enforce that, within each partition, every element belongs to some subset. I suppose I could take the semantics that a missing (element_id, partition_id) row indicates that the element belongs to a single element subset, but I would prefer to be explicit.
  • It does not enforce the requirement that a subset only belongs to one partition. That is, I could have a subset with (id, partition_id) = (22, 37), but still have a element_subset_mapping with (element_id, partition_id, subset_id) = (4005, 42, 22). This should not be allowed because subset 22 belongs to partition 37, not to partition 42.

It feels like there should be a canonical way to do this, but my searches have not yielded any answers.

1

There are 1 answers

2
JohnH On BEST ANSWER

The requirement that a subset belong to no more than one partition is already being enforced by the primary key constraint on subset; however, there is no enforcement that the combination of subset_id and partition_id in element_subset_mapping reference a corresponding pair in subset. The following DDL addresses these issues by making the combination of subset and partition unique in subset and combining the foreign keys on subset_id and partition_id in element_subset_mapping into a single foreign key referencing subset(id, partition_id):

CREATE TABLE element (
    id integer PRIMARY KEY
);

CREATE TABLE partition (
    id integer PRIMARY KEY
);

CREATE TABLE subset (
    id integer PRIMARY KEY,
    partition_id integer NOT NULL REFERENCES partition (id),
    CONSTRAINT subset_partition_uq UNIQUE (id, partition_id) 
);

CREATE TABLE element_subset_mapping (
    element_id integer REFERENCES element (id) NOT NULL,
    partition_id integer NOT NULL,
    subset_id integer NOT NULL,
    CONSTRAINT element_subset_mapping_partition_uq UNIQUE (element_id, partition_id),
    CONSTRAINT element_subset_mapping_subset_fk FOREIGN KEY (subset_id, partition_id) REFERENCES subset(id, partition_id)
);

The requirement that each element belong to exactly one subset within each partition is a more complex constraint to enforce because it involves rows from multiple tables. Let's begin by creating a trigger function that raises an exception if any elements are missing from any partitions:

CREATE OR REPLACE FUNCTION validate_all_elements_in_all_partitions() RETURNS TRIGGER AS
$FUNC$
  DECLARE
    element_id element.id%TYPE;
    partition_id partition.id%TYPE;
  BEGIN
    SELECT e.id, p.id
      INTO element_id, partition_id
      FROM element e
      CROSS JOIN partition p
      LEFT JOIN element_subset_mapping m
        ON e.id = m.element_id
          AND p.id = m.partition_id
      WHERE m.subset_id IS NULL
      LIMIT 1;

    IF element_id IS NOT NULL THEN
      RAISE EXCEPTION 'partition missing element'
        USING DETAIL = FORMAT('Partition (%1$s) is missing element (%2$s).', partition_id, element_id);
    END IF;
    RETURN NULL;
  END;
$FUNC$ LANGUAGE plpgsql;

This function can then be used to define constraint triggers on each of the tables that could cause the constraint to be violated: element, partition, and element_subset_mapping.

DROP TRIGGER IF EXISTS all_elements_in_all_partitions ON element;

CREATE CONSTRAINT TRIGGER all_elements_in_all_partitions
  AFTER DELETE OR INSERT OR UPDATE
  ON element
  DEFERRABLE INITIALLY DEFERRED
  FOR EACH ROW
  EXECUTE FUNCTION validate_all_elements_in_all_partitions();

DROP TRIGGER IF EXISTS all_elements_in_all_partitions ON partition;

CREATE CONSTRAINT TRIGGER all_elements_in_all_partitions
  AFTER DELETE OR INSERT OR UPDATE
  ON partition
  DEFERRABLE INITIALLY DEFERRED
  FOR EACH ROW
  EXECUTE FUNCTION validate_all_elements_in_all_partitions();

DROP TRIGGER IF EXISTS all_elements_in_all_partitions ON element_subset_mapping;

CREATE CONSTRAINT TRIGGER all_elements_in_all_partitions
  AFTER DELETE OR INSERT OR UPDATE
  ON element_subset_mapping
  DEFERRABLE INITIALLY DEFERRED
  FOR EACH ROW
  EXECUTE FUNCTION validate_all_elements_in_all_partitions();

An important aspect of these triggers is that they are DEFERRED, which means that they fire when the current transaction is committed instead of as each row or statement is executed. This allows a set of SQL commands to be executed within a transaction to achieve a state that is consistent with the relational requirements.

A requirement that wasn't mentioned in the original post is that all subsets must be non-empty. The following SQL addresses this using the same approach that was used to ensure that all elements are included in every partition:

CREATE OR REPLACE FUNCTION validate_no_empty_subsets() RETURNS TRIGGER AS
$FUNC$
  DECLARE
    partition_id partition.id%TYPE;
    subset_id subset.id%TYPE;
  BEGIN
    SELECT s.id, s.partition_id
      INTO subset_id, partition_id
      FROM subset s
      LEFT JOIN element_subset_mapping m
        ON s.id = m.subset_id
      WHERE m.subset_id IS NULL
      LIMIT 1;
      
    IF subset_id IS NOT NULL THEN
      RAISE EXCEPTION 'empty subset'
        USING DETAIL = FORMAT('Subset (%1$s) in partition (%2$s) is empty.', subset_id, partition_id);
    END IF;
    RETURN NULL;
  END;
$FUNC$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS no_empty_subsets ON subset;

CREATE CONSTRAINT TRIGGER no_empty_subsets
  AFTER DELETE OR INSERT OR UPDATE
  ON subset
  DEFERRABLE INITIALLY DEFERRED
  FOR EACH ROW
  EXECUTE FUNCTION validate_no_empty_subsets();

DROP TRIGGER IF EXISTS no_empty_subsets ON element_subset_mapping;

CREATE CONSTRAINT TRIGGER no_empty_subsets
  AFTER DELETE OR INSERT OR UPDATE
  ON element_subset_mapping
  DEFERRABLE INITIALLY DEFERRED
  FOR EACH ROW
  EXECUTE FUNCTION validate_no_empty_subsets();