diff --git a/Dockerfile b/Dockerfile index 5b18e7dccf..ee36821fdb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,12 +11,15 @@ FROM ruby:3.2.5 # socat is just for binding ports within docker, not needed for the application RUN apt-get update && apt-get install -y --force-yes \ libsodium-dev libgmp3-dev libssl-dev \ - libpq-dev postgresql-client \ + postgresql-common \ nodejs \ socat \ texlive-latex-base texlive-fonts-recommended texlive-fonts-extra texlive-latex-extra \ - ; -# NB: Postgres client from Debian is 9.4 - not sure if this is acceptable +; + +# pg_dump requires that the client library >= the server (major) version +RUN yes | /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh +RUN apt-get install -y --force-yes libpq-dev postgresql-client-17 RUN mkdir /SAPI WORKDIR /SAPI diff --git a/config/environments/development.rb b/config/environments/development.rb index 60a1ef6c34..9384f66f48 100644 --- a/config/environments/development.rb +++ b/config/environments/development.rb @@ -115,7 +115,7 @@ } # Fixes error "Cannot render console from 172.22.0.1!" when in Docker - config.web_console.whitelisted_ips = [ + config.web_console.allowed_ips = [ '10.0.0.0/8', '172.16.0.0/12', '192.168.0.0/16' diff --git a/db/functions/squish_null/20250522113100.sql b/db/functions/squish_null/20250522113100.sql new file mode 100644 index 0000000000..487632b46e --- /dev/null +++ b/db/functions/squish_null/20250522113100.sql @@ -0,0 +1,12 @@ +CREATE OR REPLACE FUNCTION public.squish_null(TEXT) RETURNS TEXT + LANGUAGE SQL IMMUTABLE +AS $fn$ + SELECT + CASE WHEN public.squish($1) = '' + THEN NULL + ELSE public.squish($1) + END; +$fn$; + +COMMENT ON FUNCTION public.squish_null(TEXT) IS + 'Squishes whitespace characters in a string and returns null for empty string'; diff --git a/db/helpers/000_helpers.sql b/db/helpers/000_helpers.sql index 5393152a89..4d96c0ea3a 100644 --- a/db/helpers/000_helpers.sql +++ b/db/helpers/000_helpers.sql @@ -9,15 +9,6 @@ AS $FUNCTION$ ); $FUNCTION$; -CREATE OR REPLACE FUNCTION squish_null(TEXT) RETURNS TEXT - LANGUAGE SQL IMMUTABLE - AS $$ - SELECT CASE WHEN SQUISH($1) = '' THEN NULL ELSE SQUISH($1) END; - $$; - -COMMENT ON FUNCTION squish_null(TEXT) IS - 'Squishes whitespace characters in a string and returns null for empty string'; - -- This function previously had a different signature - ensure that the old version is gone DROP FUNCTION IF EXISTS full_name_with_spp(rank_name VARCHAR(255), full_name VARCHAR(255)); diff --git a/db/migrate/20250522113100_update_squish_null_function.rb b/db/migrate/20250522113100_update_squish_null_function.rb new file mode 100644 index 0000000000..ef6b7293f2 --- /dev/null +++ b/db/migrate/20250522113100_update_squish_null_function.rb @@ -0,0 +1,13 @@ +class UpdateSquishNullFunction < ActiveRecord::Migration[7.1] + def up + safety_assured do + execute function_sql('20250522113100', 'squish_null') + end + end + + def down + safety_assured do + execute function_sql('20150421071444', 'squish_null') + end + end +end diff --git a/db/structure.sql b/db/structure.sql index 3c2ec38069..1849748f65 100644 --- a/db/structure.sql +++ b/db/structure.sql @@ -6775,8 +6775,12 @@ COMMENT ON FUNCTION public.squish(text) IS 'Squishes whitespace characters in a CREATE FUNCTION public.squish_null(text) RETURNS text LANGUAGE sql IMMUTABLE AS $_$ - SELECT CASE WHEN SQUISH($1) = '' THEN NULL ELSE SQUISH($1) END; - $_$; + SELECT + CASE WHEN public.squish($1) = '' + THEN NULL + ELSE public.squish($1) + END; +$_$; -- diff --git a/docker-compose.yml b/docker-compose.yml index 95816b64eb..649ef24b47 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -45,8 +45,8 @@ services: MINIO_ROOT_USER: minioadmin MINIO_ROOT_PASSWORD: minioadmin - db: - container_name: sapi-db + db-pg10: + container_name: sapi-db-pg10 image: postgres:10 command: postgres -c max_wal_size=2GB healthcheck: @@ -66,6 +66,27 @@ services: POSTGRES_HOST_AUTH_METHOD: "trust" POSTGRES_DB: "sapi_development" + db-pg17: + container_name: sapi-db-pg17 + image: postgres:17 + command: postgres -c max_wal_size=2GB + healthcheck: + test: ["CMD", "pg_isready", "-U", "postgres"] + timeout: 45s + interval: 10s + retries: 10 + volumes: + - ./db_init:/docker-entrypoint-initdb.d + - ./db/structure.sql:/docker-entrypoint-initdb.d/sapi_schema.sql + - 'pg17data:/var/lib/postgresql/data' + ports: + - "${SAPI_CONTAINER_DB_PORT:-5417}:5432" + networks: + - sapi + environment: + POSTGRES_HOST_AUTH_METHOD: "trust" + POSTGRES_DB: "sapi_development" + rails: container_name: sapi-rails build: @@ -114,17 +135,18 @@ services: tty: true environment: MAILER_ADDRESS: sapi-mailcatcher - SAPI_DATABASE_HOST: sapi-db + SAPI_DATABASE_HOST: sapi-db-pg17 SAPI_DATABASE_USERNAME: postgres SAPI_DATABASE_PORT: 5432 SAPI_SIDEKIQ_REDIS_URL: redis://sapi-redis:6379/0 SAPI_SIDEKIQ_REDIS_CACHE_URL: redis://sapi-redis-cache:6380/0 - CAPTIVE_BREEDING_DATABASE_HOST: sapi-db + CAPTIVE_BREEDING_DATABASE_HOST: sapi-db-pg17 SAPI_S3_PORT: '${SAPI_CONTAINER_S3_PORT:-9000}' depends_on: - redis - redis_cache - - db + - db-pg10 + - db-pg17 - mailcatcher - minio deploy: @@ -138,7 +160,7 @@ services: stdin_open: true tty: true environment: - SAPI_DATABASE_HOST: sapi-db + SAPI_DATABASE_HOST: sapi-db-pg17 SAPI_SIDEKIQ_REDIS_URL: redis://sapi-redis:6379/0 # Defaults to blank; used by AppSignal: USER: "$USER" @@ -176,7 +198,8 @@ services: networks: - sapi depends_on: - - db + - db-pg10 + - db-pg17 - redis - redis_cache - mailcatcher @@ -185,14 +208,13 @@ services: volumes: *rails_volumes environment: MAILER_ADDRESS: sapi-mailcatcher - SAPI_DATABASE_HOST: sapi-db + SAPI_DATABASE_HOST: sapi-db-pg17 SAPI_DATABASE_USERNAME: postgres SAPI_DATABASE_PORT: 5432 - SAPI_SIDEKIQ_REDIS_URL: redis://sapi-redis:6379/0 SAPI_S3_PORT: '${SAPI_CONTAINER_S3_PORT:-9000}' + SAPI_SIDEKIQ_REDIS_URL: redis://sapi-redis:6379/0 SAPI_SIDEKIQ_REDIS_CACHE_URL: redis://sapi-redis-cache:6380/0 - CAPTIVE_BREEDING_DATABASE_HOST: sapi-db - + CAPTIVE_BREEDING_DATABASE_HOST: sapi-db-pg17 mailcatcher: container_name: sapi-mailcatcher image: sj26/mailcatcher @@ -203,6 +225,7 @@ services: volumes: pgdata: + pg17data: bundler_gems: redis_data: minio-data: diff --git a/lib/tasks/db_trim.rake b/lib/tasks/db_trim.rake index c75375267e..5393435cfd 100644 --- a/lib/tasks/db_trim.rake +++ b/lib/tasks/db_trim.rake @@ -1,7 +1,25 @@ +## +# In addition to deleting data, some consideration should be given to database +# maintenance: +# +# ANALYSE calculates summary stats about a table and its columns. These stats +# are used in query planning, so bad stats +# +# REINDEX is called because some large tables have gigabytes locked away in +# indexes, and REINDEX is the quickest way to free up that space. +# +# VACUUM can only reclaim space from dead tuples so often is not worth it. +# Furthermore space it reclaims is still reserved for the table, so is not +# released by the file system - for that VACUUM FULL is required, which requires +# a full table rewrite, which takes up additional disk space which may not be +# free. + namespace :db do desc 'Deletes historic and sensitive data, runs cleanup of temporary tables and rebuilds' task trim: [ :environment, + 'db:trim_ahoy', + 'db:trim_api_requests', 'db:common_names:cleanup', 'db:taxon_names:cleanup', 'db:trim_trade', @@ -11,29 +29,99 @@ namespace :db do 'db:trim_users', 'import:drop_import_tables', 'db:migrate:rebuild', - 'db:drop_temporary_tables' + 'db:drop_temporary_tables', + 'db:vacuum_full' ] + ## + # Drop all tables of the form /^trade_sandbox_\d+$/ + # and their associated views and indexes + task trim_trade_sandboxes: :environment do + sandbox_group_count = 50 + + sandbox_query = + <<-SQL.squish + SELECT count(*) FROM information_schema.tables + WHERE table_name LIKE 'trade_sandbox%' + AND table_name != 'trade_sandbox_template' + AND table_type != 'VIEW' + SQL + + sandbox_count = + ApplicationRecord.connection.execute( + sandbox_query + )[0]['count'].to_i + + ( + sandbox_count.to_f / sandbox_group_count + ).ceil.times do + ApplicationRecord.connection.execute <<-SQL.squish + DO $do$ + DECLARE + current_table_name TEXT; + BEGIN + FOR current_table_name + IN #{sandbox_query} + LIMIT #{sandbox_group_count} + LOOP + EXECUTE 'DROP TABLE ' || current_table_name || ' CASCADE'; + END LOOP; + RETURN; + END; + $do$; + SQL + end + end + task trim_trade: :environment do - puts 'Deleting old shipments' year = Date.today.year - 5 - ApplicationRecord.connection.execute "DELETE FROM trade_shipments WHERE year <= #{year}" - puts 'Clearing permit and annual report data' - ApplicationRecord.connection.execute 'UPDATE trade_shipments SET + + puts "Deleting shipments prior to #{year}" + ApplicationRecord.connection.execute <<-SQL.squish + DELETE FROM trade_shipments WHERE year <= #{year} + SQL + + ApplicationRecord.connection.execute 'ANALYSE trade_shipments;' + ApplicationRecord.connection.execute 'REINDEX TABLE trade_shipments;' + + puts 'Clearing permits' + # Note: where clause don't make unnecessary writes to rows + ApplicationRecord.connection.execute <<-SQL.squish + UPDATE trade_shipments SET import_permit_number = NULL, export_permit_number = NULL, origin_permit_number = NULL, - import_permits_ids = \'{}\'::INT[], - export_permits_ids = \'{}\'::INT[], - origin_permits_ids = \'{}\'::INT[], + import_permits_ids = '{}'::INT[], + export_permits_ids = '{}'::INT[], + origin_permits_ids = '{}'::INT[], trade_annual_report_upload_id = NULL, - sandbox_id = NULL' - puts 'Dropping sandboxes' - ApplicationRecord.connection.execute 'SELECT * FROM drop_trade_sandboxes()' - puts 'Truncating annual reports' - ApplicationRecord.connection.execute 'DELETE FROM trade_annual_report_uploads' + sandbox_id = NULL + WHERE origin_permit_number IS NOT NULL + OR export_permit_number IS NOT NULL + OR origin_permit_number IS NOT NULL + OR trade_annual_report_upload_id IS NOT NULL + OR sandbox_id IS NOT NULL + ; + SQL + puts 'Truncating permits' ApplicationRecord.connection.execute 'TRUNCATE trade_permits' + ApplicationRecord.connection.execute 'REINDEX TABLE trade_permits' + + ## + # drop_trade_sandboxes() does not work when there are many sandboxes + # + # puts 'Dropping sandboxes' + # ApplicationRecord.connection.execute 'SELECT * FROM drop_trade_sandboxes()' + + puts "Deleting annual reports uploaded prior to #{year}" + ApplicationRecord.connection.execute <<-SQL.squish + DELETE FROM trade_annual_report_uploads + WHERE updated_at <= '#{year}-01-01'; + SQL + + ApplicationRecord.connection.execute 'ANALYSE trade_annual_report_uploads;' + ApplicationRecord.connection.execute 'REINDEX TABLE trade_annual_report_uploads;' end task trim_listing_changes: :environment do @@ -47,28 +135,32 @@ namespace :db do ON lc.parent_id = nc_lc.id ), listing_changes_to_delete AS ( SELECT * FROM non_current_listing_changes - UNION + EXCEPT SELECT * FROM exceptions ), deleted_listing_distributions AS ( DELETE FROM listing_distributions USING listing_changes_to_delete lc WHERE lc.id = listing_distributions.listing_change_id - ), deleted_annotations AS ( - DELETE FROM annotations - USING listing_changes_to_delete lc - WHERE annotations.id = lc.annotation_id ), updated_original_id AS ( UPDATE listing_changes SET original_id = NULL FROM listing_changes_to_delete WHERE listing_changes.original_id = listing_changes_to_delete.id + ), updated_parent_id AS ( + UPDATE listing_changes + SET parent_id = NULL + FROM listing_changes_to_delete + WHERE listing_changes.parent_id = listing_changes_to_delete.id ) DELETE FROM listing_changes USING listing_changes_to_delete lc - WHERE lc.id = listing_changes.id + WHERE lc.id = listing_changes.id; SQL + puts 'Deleting old listing changes' ApplicationRecord.connection.execute sql + ApplicationRecord.connection.execute 'ANALYSE listing_changes;' + ApplicationRecord.connection.execute 'REINDEX TABLE listing_changes;' end task trim_trade_restrictions: :environment do @@ -76,6 +168,10 @@ namespace :db do WITH trade_restrictions_to_delete AS ( SELECT * FROM trade_restrictions WHERE NOT is_current + ), deleted_cites_suspension_confirmations AS ( + DELETE FROM cites_suspension_confirmations + USING trade_restrictions_to_delete + WHERE cites_suspension_confirmations.cites_suspension_id = trade_restrictions_to_delete.id ), deleted_restriction_purposes AS ( DELETE FROM trade_restriction_purposes USING trade_restrictions_to_delete @@ -96,29 +192,122 @@ namespace :db do ) DELETE FROM trade_restrictions USING trade_restrictions_to_delete tr - WHERE tr.id = trade_restrictions.id + WHERE tr.id = trade_restrictions.id; SQL + puts 'Deleting old trade restrictions' ApplicationRecord.connection.execute sql + ApplicationRecord.connection.execute 'ANALYSE trade_restrictions;' + ApplicationRecord.connection.execute 'REINDEX TABLE trade_restrictions;' end task trim_eu_decisions: :environment do - sql = 'DELETE FROM eu_decisions WHERE NOT is_current' puts 'Deleting old EU decisions' - ApplicationRecord.connection.execute sql + ApplicationRecord.connection.execute <<-SQL.squish + DELETE FROM eu_decisions + WHERE NOT eu_decisions.is_current AND NOT EXISTS ( + SELECT TRUE + FROM eu_decision_confirmations + WHERE eu_decision_id = eu_decisions.id + ); + SQL + + ApplicationRecord.connection.execute 'ANALYSE eu_decisions;' + ApplicationRecord.connection.execute 'REINDEX TABLE eu_decisions;' end task trim_users: :environment do - puts 'Clearing user data' + puts 'Pseudonymising user data' ApplicationRecord.connection.execute <<-SQL.squish - UPDATE users SET - name = 'user ' || users.id, - email = 'user.' || users.id || '@test.org' + UPDATE "users" u SET + "name" = 'User ' || u.id, + "email" = 'user.' || u.id || '@test.local', + "current_sign_in_ip" = '192.168.1.' || (u.id % 256), + "last_sign_in_ip" = '192.168.1.' || (u.id % 256) + WHERE "email" NOT LIKE '%@unep-wcmc.org' + AND "email" NOT LIKE '%@test.local' SQL end + task trim_ahoy: :environment do + puts 'Removing analytics data' + + ApplicationRecord.connection.execute <<-SQL.squish + TRUNCATE TABLE ahoy_events; + REINDEX TABLE ahoy_events; + TRUNCATE TABLE ahoy_visits; + REINDEX TABLE ahoy_visits; + SQL + end + + task trim_api_requests: :environment do + cutoff = 2.years.ago.to_date.to_s + + puts "Removing records of API Request data prior to #{cutoff}" + + ApplicationRecord.connection.execute <<-SQL.squish + DELETE FROM api_requests + WHERE updated_at <= '#{cutoff}' + ; + SQL + + ApplicationRecord.connection.execute 'ANALYSE api_requests;' + ApplicationRecord.connection.execute 'REINDEX TABLE api_requests;' + end + task drop_temporary_tables: :environment do puts 'Dropping temporary tables' + ApplicationRecord.connection.execute 'SELECT * FROM drop_eu_lc_mviews()' end + + ## + # Reclaims space from any table where the number of dead tuples (old versions + # of rows left over by postgres as a result of updates/deletes) is greater + # than the number of live tuples. In effect, this means that wherever the size + # on disk of a table can be reduced by about 50% or more, a full-table rewrite + # will be performed. This requires a exclusive lock on the entire table. + task vacuum_full: :environment do + puts 'Finding tables that need vacuuming' + + vaccumable_sql = <<-SQL.squish + SELECT + st.schemaname, + st.relname, + n_live_tup, + n_dead_tup + FROM pg_catalog.pg_stat_all_tables st + JOIN pg_catalog.pg_class r + ON st.relid = r.oid + AND r.relkind = 't' + WHERE st.schemaname = 'public' + AND n_dead_tup > n_live_tup + SQL + + vacuumables = ApplicationRecord.connection.execute vaccumable_sql + + puts ( + vacuumables.map do |row| + %Q("#{row['schemaname']}"."#{row['relname']}": #{row['n_dead_tup']} dead, #{row['n_live_tup']} live") + end.join("\n") + ) + + ApplicationRecord.connection.execute <<-SQL.squish + DO $do$ + DECLARE + schemaname TEXT; + relname TEXT; + BEGIN + FOR schemaname, relname IN #{vaccumable_sql} + LOOP + EXECUTE format( + 'VACUUM FULL ANALYSE %I$1.%I$2', + schemaname, relname + ); + END LOOP; + RETURN; + END; + $do$; + SQL + end end