Improve insert when normalizing database in PostgreSQL

36 views Asked by At

I am given a dataset in a 3Gb CSV file, about 9,2 million rows. It has 18 columns, so I need to normalize it to different tables.

This data set stores employment history of employees in a company for around 30 years. Any changes like an employee having a new wage, new department, new position or having left a company will be recorded.

I have finished the DB design, but when I implement it in PostgreSQLI run into a problem when I cannot load the data from the dataset into table employment_history.

I create table full_data, then import the whole dataset from the CSV file into it. I will insert into other tables from table full_data. All tables run except the final one, employment_history. The query runs for 4 hours, and PostgreSQL shuts down. I tried EXPLAIN and creating an index on needed column; this reduces the costs from 8 millions to around 900,000, but the query still runs very long and PostgreSQL always shuts down.

Is the problem relating to my query or just because my computer is not strong enough?

What is my problem?

What are solutions?

I have a DELL Vostro with 8gb ram, 123gb SSD and 1Tb HDD. I have tried to put data in both SSD and HDD drivers but the result still the same

Below is my data dictionary, DB design and migration code.

db diagram

data dictionary

SET work_mem = '100MB';
SET hash_mem_multiplier = 8.0;

SHOW shared_buffers;
ALTER SYSTEM SET shared_buffers TO '2GB';

-- DELETE FROM full_data WHERE title IS NULL;

-- Create table employees and migrate data into
CREATE TABLE employees (
    employee_id SERIAL PRIMARY KEY,
    employee_code TEXT,
    first_name VARCHAR(30),
    last_name VARCHAR(30),
    fullname VARCHAR(50),
    gender CHAR(1),
    dob DATE,
    hire_date DATE,
    departured_date DATE
);
INSERT INTO employees (employee_code, first_name, last_name, fullname, gender, dob, hire_date, departured_date)
SELECT employee_code, first_name, last_name, fullname, gender, dob, hire_date, departured_date 
FROM full_data
GROUP BY employee_code, first_name, last_name, fullname, gender, dob, hire_date, departured_date;

-- Create table titles and migrate data into
CREATE TABLE titles (
    title_id SERIAL PRIMARY KEY,
    title VARCHAR(30)
);
INSERT INTO titles (title)
SELECT DISTINCT title FROM full_data;

-- Create table departments and migrate data into
CREATE TABLE departments (
    department_id SERIAL PRIMARY KEY,
    department VARCHAR(30)
);
INSERT INTO departments (department)
SELECT DISTINCT department_name FROM full_data;

-- Create table employee_title and migrate data into
CREATE TABLE employee_title (
    employee_title_id SERIAL PRIMARY KEY,
    employee_id INT REFERENCES employees(employee_id),
    title_id INT REFERENCES titles(title_id),
    effective_title_from_date DATE,
    effective_title_end_date DATE
);
INSERT INTO employee_title (
    employee_id, 
    title_id, 
    effective_title_from_date, 
    effective_title_end_date
)
SELECT employee_id, title_id, etfd, eted FROM full_data
JOIN titles 
    ON full_data.title = titles.title
JOIN employees 
    ON full_data.employee_code = employees.employee_code
GROUP BY employee_id, title_id, etfd, eted;

-- Create table employee_department and migrate data into
CREATE TABLE employee_department (
    employee_department_id SERIAL PRIMARY KEY,
    employee_id INT REFERENCES employees(employee_id),
    department_id INT REFERENCES departments(department_id),
    effective_department_from_date DATE,
    effective_department_end_date DATE
);
INSERT INTO employee_department (
    employee_id, 
    department_id, 
    effective_department_from_date, 
    effective_department_end_date
)
SELECT employee_id, department_id, edfd, eded FROM full_data
JOIN departments 
    ON full_data.department_name = departments.department
JOIN employees
    ON full_data.employee_code = employees.employee_code
GROUP BY employee_id, department_id, edfd, eded;

-- Create table employee_salary and migrate data into
CREATE TABLE employee_salary (
    employee_salary_id SERIAL PRIMARY KEY,
    employee_id INT REFERENCES employees(employee_id),
    wage INT,
    effective_salary_from_date DATE,
    effective_salary_end_date DATE
);
INSERT INTO employee_salary (
    employee_id, 
    wage, 
    effective_salary_from_date, 
    effective_salary_end_date
)
SELECT employee_id, wage, esfd, esed FROM full_data
JOIN employees 
    ON full_data.employee_code = employees.employee_code
GROUP BY employee_id, wage, esfd, esed;

-- Create table employment_history and migrate data into
CREATE TABLE employment_history (
    employment_history_id SERIAL PRIMARY KEY,
    employee_id INT REFERENCES employees(employee_id),
    employee_title_id INT REFERENCES employee_title(employee_title_id),
    employee_department_id INT REFERENCES employee_department(employee_department_id),
    employee_salary_id INT REFERENCES employee_salary(employee_salary_id),
    created_at DATE
);

-- Chosen index
CREATE INDEX emp_salary_date ON employee_salary (effective_salary_from_date, effective_salary_end_date, wage);
CREATE INDEX emp_tit_date ON employee_title (effective_title_from_date, effective_title_end_date);
CREATE INDEX emp_dept_date ON employee_department (effective_department_from_date, effective_department_end_date);
CREATE INDEX emp_code ON employees (employee_code);
CREATE INDEX full_data_salary_date ON full_data (esfd, esed, wage);
-- Chosen index

INSERT INTO employment_history (
    employee_id,
    employee_title_id, 
    employee_department_id, 
    employee_salary_id,
    created_at
)
SELECT 
    employees.employee_id, 
    employee_title_id,
    employee_department.employee_department_id, 
    employee_salary_id, created_at
FROM full_data
JOIN employee_salary
    ON full_data.esfd = employee_salary.effective_salary_from_date
    AND full_data.esed = employee_salary.effective_salary_end_date
    AND full_data.wage = employee_salary.wage
JOIN employees 
    ON full_data.employee_code = employees.employee_code
JOIN employee_title 
    ON full_data.etfd = employee_title.effective_title_from_date
    AND full_data.eted = employee_title.effective_title_end_date
JOIN employee_department 
    ON full_data.edfd = employee_department.effective_department_from_date
    AND full_data.eded = employee_department.effective_department_end_date
JOIN departments 
    ON departments.department = full_data.department_name
JOIN titles 
    ON titles.title = full_data.title;
0

There are 0 answers