From cd5d65e8c5b538ac5db6cece5d2cd5430c52f384 Mon Sep 17 00:00:00 2001 From: Shivam Tripathi Date: Thu, 3 May 2018 21:27:17 +0530 Subject: [PATCH 1/4] feat(migration/2018-05-03/up): Add changes to make use of Postgres FTS features --- migrations/2018-05-03/up.sql | 177 +++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 migrations/2018-05-03/up.sql diff --git a/migrations/2018-05-03/up.sql b/migrations/2018-05-03/up.sql new file mode 100644 index 0000000..d1f734b --- /dev/null +++ b/migrations/2018-05-03/up.sql @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2018 Shivam Tripathi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +BEGIN; + +/* + * This migration makes necessary changes in the database to avail full text search + * feature of Postgres on BookBrainz site. + */ + +CREATE EXTENSION pg_trgm; +CREATE EXTENSION unaccent; +CREATE EXTENSION btree_gin; + +-- Column to cache tokenized alias names +ALTER TABLE bookbrainz.alias + ADD COLUMN IF NOT EXISTS tokens TSVECTOR; + + +-- Cache tokens of all previously added alias names +UPDATE bookbrainz.alias + SET tokens = to_tsvector('pg_catalog.simple', unaccent(name)); + + +-- Function to unaccent the name row before tokenizing it +CREATE FUNCTION bookbrainz.token_creator() RETURNS trigger + AS $token_creator$ + BEGIN + new.tokens := to_tsvector('pg_catalog.simple', unaccent(new.name)); + RETURN new; + END +$token_creator$ LANGUAGE plpgsql; + + +-- Trigger to automatically unaccent and insert tokens of an added alias name +CREATE TRIGGER alias_token_update BEFORE INSERT OR UPDATE + ON bookbrainz.alias + FOR EACH ROW + EXECUTE PROCEDURE bookbrainz.token_creator(); + + +-- View to extract all the master entities +CREATE VIEW bookbrainz.master_entities AS + SELECT bbid as entity_id, default_alias_id as alias_id, 'creator' AS entity_type + FROM bookbrainz.creator + WHERE master=true + UNION + SELECT bbid as entity_id, default_alias_id as alias_id, 'edition' AS entity_type + FROM bookbrainz.edition + WHERE master=true + UNION + SELECT bbid as entity_id, default_alias_id as alias_id, 'publication' AS entity_type + FROM bookbrainz.publication + WHERE master=true + UNION + SELECT bbid as entity_id, default_alias_id as alias_id, 'publisher' AS entity_type + FROM bookbrainz.publisher + WHERE master=true + UNION + SELECT bbid as entity_id, default_alias_id as alias_id, 'work' AS entity_type + FROM bookbrainz.work + WHERE master=true; + + +/* + * The primary materialized view which holds all the relevant information about master entities. + * Token field is indexed for quick FTS. + * A unique index is created to help execute REFRESH CONCURRENTLY command. + * This unique indexing can be modified later when imports are added. + * Sample query: + * SELECT * FROM search_mv + * WHERE tokens @@ to_tsquery('pg_catalog.simple', 'Cha:* | Dar:*') + * ORDER BY ts_rank(search_mv.tokens, to_tsquery('english', 'Cha:* | Dar:*')) DESC; + */ + CREATE MATERIALIZED VIEW bookbrainz.search_mv AS + SELECT items.entity_id, alias.name, alias.tokens, items.entity_type + FROM bookbrainz.alias AS alias + JOIN ( + SELECT * + FROM bookbrainz.master_entities + ) AS items + ON alias.id = items.alias_id; + +CREATE INDEX fts_search_idx ON bookbrainz.search_mv USING gin(tokens); +CREATE UNIQUE INDEX unique_search_idx ON bookbrainz.search_mv (entity_id); + + +/* + * This view holds all the master entities with empty tokens, signifying unsupported + * alphabet. An index is added on the name field of search_mv to facilitate quicker + * ILIKE operations. + */ +CREATE VIEW bookbrainz.untokenized_names AS + SELECT * FROM bookbrainz.search_mv WHERE tokens = ''; + +CREATE INDEX untokenized_names_idx ON bookbrainz.search_mv USING gin(name); + + +/* + * This materialized view stores all unique tokens, used to suggest the next best query. + * Words have been trigram indexed for quick fuzzy search. + * A unique index has been created on words to help execute REFRESH CONCURRENTLY command. + * Sample query: + * select word from search_words_mv ORDER BY similarity(word, 'chales') DESC limit 10; + */ +CREATE MATERIALIZED VIEW bookbrainz.search_words_mv AS + SELECT word FROM ts_stat($$ + SELECT alias.tokens + FROM bookbrainz.alias AS alias + JOIN ( + SELECT alias_id + FROM bookbrainz.master_entities + ) AS items + ON alias.id = items.alias_id; + $$); + +CREATE INDEX search_words_mv_idx ON search_words_mv USING gin(word gin_trgm_ops); +CREATE UNIQUE INDEX unique_word_idx ON bookbrainz.search_words_mv (word); + + +-- Refresh search_mv and search_words_mv materialized views +CREATE FUNCTION bookbrainz.refresh_mv() RETURNS TRIGGER + AS $refresh_mv$ + BEGIN + REFRESH MATERIALIZED VIEW CONCURRENTLY bookbrainz.search_mv; + REFRESH MATERIALIZED VIEW CONCURRENTLY bookbrainz.search_words_mv; + RETURN NULL; + END +$refresh_mv$ LANGUAGE plpgsql; + + +/* + * Triggers which call refresh_mv function to refresh all materialized views + * upon any update or addition of entities. + */ +CREATE TRIGGER mv_creator_update AFTER INSERT OR UPDATE + ON bookbrainz.creator_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + +CREATE TRIGGER mv_edition_update AFTER INSERT OR UPDATE + ON bookbrainz.edition_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + +CREATE TRIGGER mv_publication_update AFTER INSERT OR UPDATE + ON bookbrainz.publication_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + +CREATE TRIGGER mv_publisher_update AFTER INSERT OR UPDATE + ON bookbrainz.publisher_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + +CREATE TRIGGER mv_work_update AFTER INSERT OR UPDATE + ON bookbrainz.work_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + + +ALTER MATERIALIZED VIEW IF EXISTS bookbrainz.search_mv OWNER TO bookbrainz; +ALTER MATERIALIZED VIEW IF EXISTS bookbrainz.search_words_mv OWNER TO bookbrainz; +ALTER VIEW IF EXISTS bookbrainz.master_entities OWNER TO bookbrainz; +ALTER VIEW IF EXISTS bookbrainz.untokenized_names OWNER TO bookbrainz; + +END; From 3b83958051484f0e14db26d5eccd2078dafd9688 Mon Sep 17 00:00:00 2001 From: Shivam Tripathi Date: Fri, 4 May 2018 22:37:50 +0530 Subject: [PATCH 2/4] feat(migration/2018-05-03/down): Bring down features added for Postgres FTS feature --- migrations/2018-05-03/down.sql | 43 ++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 migrations/2018-05-03/down.sql diff --git a/migrations/2018-05-03/down.sql b/migrations/2018-05-03/down.sql new file mode 100644 index 0000000..28af83f --- /dev/null +++ b/migrations/2018-05-03/down.sql @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2018 Shivam Tripathi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +BEGIN; + +DROP VIEW IF EXISTS bookbrainz.untokenized_names; +DROP MATERIALIZED VIEW IF EXISTS bookbrainz.search_mv; +DROP MATERIALIZED VIEW IF EXISTS bookbrainz.search_words_mv; +DROP VIEW IF EXISTS bookbrainz.master_entities; + +DROP TRIGGER IF EXISTS alias_token_update ON bookbrainz.alias; + +DROP TRIGGER IF EXISTS mv_publisher_update ON bookbrainz.publisher_header; +DROP TRIGGER IF EXISTS mv_creator_update ON bookbrainz.creator_header; +DROP TRIGGER IF EXISTS mv_work_update ON bookbrainz.work_header; +DROP TRIGGER IF EXISTS mv_edition_update ON bookbrainz.edition_header; +DROP TRIGGER IF EXISTS mv_publication_update ON bookbrainz.publication_header; + +DROP FUNCTION IF EXISTS bookbrainz.token_creator; +DROP FUNCTION IF EXISTS bookbrainz.refresh_mv cascade; + +ALTER TABLE bookbrainz.alias DROP COLUMN tokens; + +DROP EXTENSION IF EXISTS pg_trgm; +DROP EXTENSION IF EXISTS unaccent; +DROP EXTENSION IF EXISTS btree_gin; + +END; From a77295f9e414a2b5f4e19bd728313cc08ab44b76 Mon Sep 17 00:00:00 2001 From: Shivam Tripathi Date: Fri, 4 May 2018 20:44:02 +0530 Subject: [PATCH 3/4] feat(schema): Make changes to schema to set up Postgres FTS * Add extentions pg_trgm, unaccent, btree_gin repectively for trigram fuzzy search, unaccenting strings and using gin index. * Add column tokens TSVECTOR to alias table to add tokenized names. * Add view master_entities to extract all the master entities. * Add materialized view search_mv holding all the search relevant information about master entities. * Add materialized view search_words_mv for all unique tokens (used to suggest the next best query). * Index fts_search_idx on materialized view search_mv column tokens to for quick FTS. * Add view untokenized_names holding all master entities with empty tokens, signifying unsupported names by to_tsvector. * Index untokenized_names_idx on materialized view search_mv column names to aid quick view untokenized_names query. * Unique indexes on search_mv and search_words_mv to aid REFRESH CONCURRENTLY command. --- schemas/bookbrainz.sql | 68 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/schemas/bookbrainz.sql b/schemas/bookbrainz.sql index e9a781e..12c0ab8 100644 --- a/schemas/bookbrainz.sql +++ b/schemas/bookbrainz.sql @@ -1,5 +1,9 @@ BEGIN; +CREATE EXTENSION pg_trgm; +CREATE EXTENSION unaccent; +CREATE EXTENSION btree_gin; + CREATE TYPE bookbrainz.lang_proficiency AS ENUM ( 'BASIC', 'INTERMEDIATE', @@ -439,6 +443,7 @@ CREATE TABLE bookbrainz.alias ( name TEXT NOT NULL CHECK (name <> ''), sort_name TEXT NOT NULL CHECK (sort_name <> ''), language_id INT, + tokens TSVECTOR, "primary" BOOLEAN NOT NULL DEFAULT FALSE ); ALTER TABLE bookbrainz.alias ADD FOREIGN KEY (language_id) REFERENCES musicbrainz.language (id) DEFERRABLE; @@ -676,4 +681,67 @@ CREATE VIEW bookbrainz.publication AS LEFT JOIN bookbrainz.alias_set als ON pcd.alias_set_id = als.id WHERE e.type = 'Publication'; +-- View to extract all the master entities +CREATE VIEW bookbrainz.master_entities AS + SELECT bbid as entity_id, default_alias_id as alias_id, 'creator' AS entity_type + FROM bookbrainz.creator + WHERE master=true + UNION + SELECT bbid as entity_id, default_alias_id as alias_id, 'edition' AS entity_type + FROM bookbrainz.edition + WHERE master=true + UNION + SELECT bbid as entity_id, default_alias_id as alias_id, 'publication' AS entity_type + FROM bookbrainz.publication + WHERE master=true + UNION + SELECT bbid as entity_id, default_alias_id as alias_id, 'publisher' AS entity_type + FROM bookbrainz.publisher + WHERE master=true + UNION + SELECT bbid as entity_id, default_alias_id as alias_id, 'work' AS entity_type + FROM bookbrainz.work + WHERE master=true; + +-- Materialized view holding all the search relevant information about master entities +CREATE MATERIALIZED VIEW bookbrainz.search_mv AS + SELECT items.entity_id, alias.name, alias.tokens, items.entity_type + FROM bookbrainz.alias AS alias + JOIN ( + SELECT * + FROM bookbrainz.master_entities + ) AS items + ON alias.id = items.alias_id; + +-- Index on token field of materialized view search_mv for quick FTS +CREATE INDEX fts_search_idx ON bookbrainz.search_mv USING gin(tokens); + +-- Unique index on materialized view search_mv created to help execute REFRESH CONCURRENTLY command +CREATE UNIQUE INDEX unique_search_idx ON bookbrainz.search_mv (entity_id); + +-- Index on name field of materialized view search_mv to facilitate quick matching operations +CREATE INDEX untokenized_names_idx ON bookbrainz.search_mv USING gin(name); + +-- View holding all master entities with empty tokens, signifying unsupported names by to_tsvector +CREATE VIEW bookbrainz.untokenized_names AS + SELECT * FROM bookbrainz.search_mv WHERE tokens = ''; + +-- Materialized view for all unique tokens (used to suggest the next best query) +CREATE MATERIALIZED VIEW bookbrainz.search_words_mv AS + SELECT word FROM ts_stat($$ + SELECT alias.tokens + FROM bookbrainz.alias AS alias + JOIN ( + SELECT alias_id + FROM bookbrainz.master_entities + ) AS items + ON alias.id = items.alias_id; + $$); + +-- Trigram index on materialized view search_words_mv for quick fuzzy search. +CREATE INDEX search_words_mv_idx ON search_words_mv USING gin(word gin_trgm_ops); + +-- Unique index on materialized view search_words_mv to help execute REFRESH CONCURRENTLY command +CREATE UNIQUE INDEX unique_word_idx ON bookbrainz.search_words_mv (word); + COMMIT; From a577d12b8eb86b2c89cec8e82c15052586729421 Mon Sep 17 00:00:00 2001 From: Shivam Tripathi Date: Fri, 4 May 2018 22:03:07 +0530 Subject: [PATCH 4/4] feat(triggers): Add necessary triggers to aid Postgres FTS * Add function token_creator to tokenized name and enter it as token field in alias table * Add function refresh_mv which refreshes all materialized views linked with FTS - search_mv and search_words_mv * Add triggers for insert/update on each entity_header to call the refresh_mv function each time a new entity is inserted or updated (master revision). * Add trigger for insert on alias to call function token_creator. --- scripts/create_triggers.sql | 72 +++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/scripts/create_triggers.sql b/scripts/create_triggers.sql index 63066bb..4d1a0bf 100644 --- a/scripts/create_triggers.sql +++ b/scripts/create_triggers.sql @@ -229,6 +229,26 @@ CREATE OR REPLACE FUNCTION bookbrainz.process_publication() RETURNS TRIGGER END; $process_publication$ LANGUAGE plpgsql; +CREATE FUNCTION bookbrainz.token_creator() RETURNS trigger + AS $token_creator$ + BEGIN + new.tokens := to_tsvector('pg_catalog.simple', unaccent(new.name)); + RETURN new; + END +$token_creator$ LANGUAGE plpgsql; + +-- Function refreshes search_mv and search_words_mv materialized views +CREATE FUNCTION bookbrainz.refresh_mv() RETURNS TRIGGER + AS $refresh_mv$ + BEGIN + RAISE NOTICE 'Refresh Fires!'; + REFRESH MATERIALIZED VIEW CONCURRENTLY bookbrainz.search_mv; + REFRESH MATERIALIZED VIEW CONCURRENTLY bookbrainz.search_words_mv; + RAISE NOTICE 'Refreshing ...'; + RETURN NULL; + END +$refresh_mv$ LANGUAGE plpgsql; + BEGIN; @@ -269,3 +289,55 @@ CREATE TRIGGER process_publication FOR EACH ROW EXECUTE PROCEDURE bookbrainz.process_publication(); COMMIT; + +-- Trigger to unaccent and create tokens of alias name +CREATE TRIGGER alias_token_update BEFORE INSERT OR UPDATE + ON bookbrainz.alias FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.token_creator(); + +COMMIT; + +BEGIN; + +-- Trigger to refresh views upon addition/update of creator +CREATE TRIGGER mv_creator_update AFTER INSERT OR UPDATE + ON bookbrainz.creator_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + +COMMIT; + +BEGIN; + +-- Trigger to refresh views upon addition/update of edition +CREATE TRIGGER mv_edition_update AFTER INSERT OR UPDATE + ON bookbrainz.edition_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + +COMMIT; + +BEGIN; + +-- Trigger to refresh views upon addition/update of publication +CREATE TRIGGER mv_publication_update AFTER INSERT OR UPDATE + ON bookbrainz.publication_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + +COMMIT; + +BEGIN; + +-- Trigger to refresh views upon addition/update of publisher +CREATE TRIGGER mv_publisher_update AFTER INSERT OR UPDATE + ON bookbrainz.publisher_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + +COMMIT; + +BEGIN; + +-- Trigger to refresh views upon addition/update of work +CREATE TRIGGER mv_work_update AFTER INSERT OR UPDATE + ON bookbrainz.work_header FOR EACH ROW EXECUTE PROCEDURE + bookbrainz.refresh_mv(); + +COMMIT;