I have my master table of sources:
CREATE TABLE masterSources (
pk int NOT NULL AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(500)
);
INSERT INTO masterSources (name) VALUES
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.step_count.delta:com.google.android.gms:estimated_steps'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:session_activity_segment'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.calories.expended:com.google.android.gms:from_activities');
Then, I receive thousands and thousands of sources that I can bulk into this table:
CREATE TABLE receivedSources (
name VARCHAR(500)
);
INSERT INTO receivedSources (name) VALUES
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'), #repeated
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:gew8df280c:top_level'), #new
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:ged8df280c:low_level'), #new
('derived:com.google.step_count.delta:com.google.android.gms:estimated_steps'),
('derived:com.google.step_count.delta:com.google.android.gms:estimated_steps'), #repeated
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:session_activity_segment'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:iphone:1213084:top_level'), #repeated
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.calories.expended:com.google.android.gms:fromx2_activities'), #new
('derived:com.google.calories.expended:com.google.android.gms:from_activities');
What I want is to add the "new" sources into my masterSources table and finally generate a new table associating each source from receivedSources with its corresponding pk.
I can do it with a procedure:
CREATE PROCEDURE my_procedure()
BEGIN
DECLARE newName VARCHAR(500);
DECLARE done BOOLEAN DEFAULT FALSE;
# receivedSources MINUS masterSources
DECLARE cur CURSOR FOR SELECT n.name
FROM receivedSources n LEFT JOIN masterSources m ON (n.name=m.name)
WHERE m.pk IS NULL;
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET done = TRUE;
OPEN cur;
loop1: LOOP
FETCH cur INTO newName;
IF done THEN
LEAVE loop1;
END IF;
# Inserting new Sources in master
INSERT INTO masterSources (name) VALUES (newName);
END LOOP loop1;
CLOSE cur;
# Getting my desired table
CREATE TABLE newReceivedSources AS
SELECT m.pk,m.name FROM receivedSources n INNER JOIN masterSources m ON (n.name=m.name);
END
call my_procedure()
select * from newReceivedSources
My question: Is there a faster/smarter way to do it, considering how expensive is the join by varchar fields?

First add a unique key on
masterSources(Optional) to increase performance, also add a (non-unique) key on
receivedSourcesThen insert any new unique values into masterSources
Then retrieve your desired dataset