diff --git a/.github/workflows/offline.yml b/.github/workflows/offline.yml index 80d78e178..c1ab0117b 100644 --- a/.github/workflows/offline.yml +++ b/.github/workflows/offline.yml @@ -1,15 +1,15 @@ on: push: branches: [master, develop] - tags: [ v* ] + tags: [v*] paths-ignore: - - '*.md' - - '**/*.md' + - "*.md" + - "**/*.md" pull_request: branches: [master, develop] paths-ignore: - - '*.md' - - '**/*.md' + - "*.md" + - "**/*.md" jobs: # Build default profile and create local assets build-default: @@ -167,16 +167,16 @@ jobs: - name: Process the demo profile build run: ./offline/demo-build/build.sh env: - GPG_PRIVATE_KEY: '${{ secrets.GPG_PRIVATE_KEY }}' - DOCKER_LOGIN: '${{ secrets.DOCKER_LOGIN }}' + GPG_PRIVATE_KEY: "${{ secrets.GPG_PRIVATE_KEY }}" + DOCKER_LOGIN: "${{ secrets.DOCKER_LOGIN }}" - name: Copy demo build assets tarball to S3 run: | aws s3 cp offline/demo-build/output/assets.tgz s3://public.wire.com/artifacts/wire-server-deploy-static-demo-${{ steps.upload_name.outputs.UPLOAD_NAME }}.tgz echo "Uploaded to: https://s3-$AWS_REGION.amazonaws.com/public.wire.com/artifacts/wire-server-deploy-static-demo-${{ steps.upload_name.outputs.UPLOAD_NAME }}.tgz" env: - AWS_ACCESS_KEY_ID: '${{ secrets.AWS_ACCESS_KEY_ID }}' - AWS_SECRET_ACCESS_KEY: '${{ secrets.AWS_SECRET_ACCESS_KEY }}' + AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}" AWS_REGION: "eu-west-1" - name: Cleanup demo build assets @@ -208,16 +208,16 @@ jobs: - name: Process the min profile build run: ./offline/min-build/build.sh env: - GPG_PRIVATE_KEY: '${{ secrets.GPG_PRIVATE_KEY }}' - DOCKER_LOGIN: '${{ secrets.DOCKER_LOGIN }}' + GPG_PRIVATE_KEY: "${{ secrets.GPG_PRIVATE_KEY }}" + DOCKER_LOGIN: "${{ secrets.DOCKER_LOGIN }}" - name: Copy min build assets tarball to S3 run: | aws s3 cp offline/min-build/output/assets.tgz s3://public.wire.com/artifacts/wire-server-deploy-static-min-${{ steps.upload_name.outputs.UPLOAD_NAME }}.tgz echo 
"Uploaded to: https://s3-$AWS_REGION.amazonaws.com/public.wire.com/artifacts/wire-server-deploy-static-min-${{ steps.upload_name.outputs.UPLOAD_NAME }}.tgz" env: - AWS_ACCESS_KEY_ID: '${{ secrets.AWS_ACCESS_KEY_ID }}' - AWS_SECRET_ACCESS_KEY: '${{ secrets.AWS_SECRET_ACCESS_KEY }}' + AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}" AWS_REGION: "eu-west-1" - name: Cleanup min build assets diff --git a/.gitignore b/.gitignore index 52924fc27..1c0bd60e4 100644 --- a/.gitignore +++ b/.gitignore @@ -18,7 +18,7 @@ values-init-done # Envrc local overrides .envrc.local - +.vscode # Nix-created result symlinks result result-* diff --git a/ansible/inventory/offline/99-static b/ansible/inventory/offline/99-static index d8b6b3efe..990cab90e 100644 --- a/ansible/inventory/offline/99-static +++ b/ansible/inventory/offline/99-static @@ -85,8 +85,7 @@ postgresql_network_interface = enp1s0 wire_dbname = wire-server wire_user = wire-server -# if not defined, a random password will be generated -# wire_pass = verysecurepassword +wire_namespace = default # Kubernetes namespace for secret storage [elasticsearch:vars] # elasticsearch_network_interface = enp1s0 diff --git a/ansible/inventory/offline/group_vars/postgresql/postgresql.yml b/ansible/inventory/offline/group_vars/postgresql/postgresql.yml index 1a6242129..ec59376cd 100644 --- a/ansible/inventory/offline/group_vars/postgresql/postgresql.yml +++ b/ansible/inventory/offline/group_vars/postgresql/postgresql.yml @@ -3,10 +3,44 @@ postgresql_version: 17 postgresql_data_dir: /var/lib/postgresql/{{ postgresql_version }}/main postgresql_conf_dir: /etc/postgresql/{{ postgresql_version }}/main -# Replication services configuration -repsvc_user: repsvc -repsvc_password: "securepassword" -repsvc_database: repsvc_db +# repmgr HA configuration +repmgr_user: repmgr +repmgr_password: "securepassword" +repmgr_database: repmgr + +# Node configuration for repmgr 
+repmgr_node_config: + postgresql1: # Maps to postgresql_rw group + node_id: 1 + priority: 150 + role: primary + postgresql2: # Maps to first postgresql_ro + node_id: 2 + priority: 100 + role: standby + postgresql3: # Maps to second postgresql_ro + node_id: 3 + priority: 50 + role: standby + +# repmgr settings +# repmgrd monitoring and reconnection configuration +# Reference: https://repmgr.org/docs/current/repmgrd-basic-configuration.html +# +# monitor_interval_secs: Interval in seconds between monitoring checks +# - Default: 2 seconds +# - Controls how frequently repmgr monitors the primary server status +# +# reconnect_attempts: Maximum number of reconnection attempts +# - Default: 6 attempts +# - Number of times repmgr will attempt to reconnect to a failed primary +# +# reconnect_interval: Interval in seconds between reconnection attempts +# - Default: 10 seconds +# - Time to wait between each reconnection attempt +monitor_interval_secs: 2 +reconnect_attempts: 6 +reconnect_interval: 5 # Use local packages instead of repository postgresql_use_repository: false # Set to true to use local packages from urls @@ -35,3 +69,12 @@ postgresql_pkgs: - name: python3-psycopg2 url: "{{ binaries_url }}/python3-psycopg2_2.9.10-1.pgdg22.04+1_amd64.deb" checksum: "sha256:cc2f749e3af292a67e012edeb4aa5d284f57f2d66a9a09fe5b81e5ffda73cab4" + - name: repmgr-common + url: "{{ binaries_url }}/repmgr-common_5.5.0+debpgdg-1.pgdg22.04+1_all.deb" + checksum: "sha256:34c660c66a9710fd4f20a66cc932741d3399dbba7e7ae4b67468b3e18f65f61c" + - name: repmgr + url: "{{ binaries_url }}/repmgr_5.5.0+debpgdg-1.pgdg22.04+1_all.deb" + checksum: "sha256:20c280811e758106335df1eb9954b61aa552823d3129f1e38c488fbd5efe0567" + - name: postgresql-17-repmgr + url: "{{ binaries_url }}/postgresql-17-repmgr_5.5.0+debpgdg-1.pgdg22.04+1_amd64.deb" + checksum: "sha256:520d6ed4d540a2bb9174ac8276f8cb686c0268c13cccb89b28a9cdbd12049df8" \ No newline at end of file diff --git a/ansible/postgresql-deploy.yml 
b/ansible/postgresql-deploy.yml index a2697d27c..8bcab6018 100644 --- a/ansible/postgresql-deploy.yml +++ b/ansible/postgresql-deploy.yml @@ -1,3 +1,9 @@ +- name: Clean previous deployment state + import_playbook: postgresql-playbooks/clean_existing_setup.yml + tags: + - postgresql + - cleanup + - name: Install PostgreSQL packages import_playbook: postgresql-playbooks/postgresql-install.yml tags: @@ -27,3 +33,9 @@ tags: - postgresql - wire-setup + +- name: Deploy cluster monitoring + import_playbook: postgresql-playbooks/postgresql-monitoring.yml + tags: + - postgresql + - monitoring diff --git a/ansible/postgresql-playbooks/clean_existing_setup.yml b/ansible/postgresql-playbooks/clean_existing_setup.yml new file mode 100644 index 000000000..d7598ec84 --- /dev/null +++ b/ansible/postgresql-playbooks/clean_existing_setup.yml @@ -0,0 +1,173 @@ +- name: Clean previous deployment state + hosts: "{{ target_nodes | default('postgresql_rw,postgresql_ro') }}" + become: yes + tasks: + # ===== DETECT INSTALLATION TYPE ===== + - name: Check if PostgreSQL is installed + stat: + path: "/usr/bin/psql" + register: postgresql_installed + + - name: Check if PostgreSQL data directory exists + stat: + path: "/var/lib/postgresql/{{ postgresql_version }}/main/PG_VERSION" + register: postgresql_data_exists + + - name: Check if repmgr configuration exists + stat: + path: "/etc/repmgr/{{ postgresql_version }}-main/repmgr.conf" + register: repmgr_config_exists + + - name: Determine if this is a fresh installation + set_fact: + is_fresh_install: >- + {{ + not postgresql_installed.stat.exists or + not postgresql_data_exists.stat.exists or + not repmgr_config_exists.stat.exists + }} + + - name: Display installation type + debug: + msg: | + {{ inventory_hostname }}: {{ 'Fresh installation detected - skipping most cleanup tasks' if is_fresh_install else 'Existing deployment detected - performing full cleanup' }} + + # ===== FRESH INSTALLATION TASKS (MINIMAL) ===== + - name: Handle fresh 
installation + block: + - name: Ensure basic directories exist for fresh install + file: + path: "{{ item }}" + state: directory + owner: postgres + group: postgres + mode: "0755" + loop: + - "/etc/repmgr/{{ postgresql_version }}-main" + - "/opt/repmgr/scripts" + - "/var/log/postgresql" + when: postgresql_installed.stat.exists + + - name: Skip cleanup message for fresh install + debug: + msg: "Fresh installation - cleanup tasks skipped" + + when: is_fresh_install + + # ===== EXISTING DEPLOYMENT CLEANUP ===== + - name: Handle existing deployment cleanup + block: + - name: Check if PostgreSQL service exists + systemd: + name: "postgresql@{{ postgresql_version }}-main.service" + register: postgresql_service_exists + failed_when: false + + - name: Check if repmgr database exists + ansible.builtin.shell: | + sudo -u postgres psql -t -A -c "SELECT COUNT(*) FROM pg_database WHERE datname = '{{ repmgr_database }}'" postgres 2>/dev/null || echo "0" + register: repmgr_db_exists + changed_when: false + failed_when: false + when: + - postgresql_installed.stat.exists + - postgresql_service_exists.status is defined + - postgresql_service_exists.status.LoadState != "not-found" + + - name: Drop repmgr database completely (if exists) + ansible.builtin.shell: | + sudo -u postgres psql -c "DROP DATABASE IF EXISTS {{ repmgr_database }};" postgres 2>/dev/null || true + failed_when: false + when: + - postgresql_installed.stat.exists + - repmgr_db_exists is defined + - repmgr_db_exists.stdout | default('0') | trim != '0' + + - name: Stop any existing split-brain monitoring timer + systemd: + name: detect-rogue-primary.timer + state: stopped + failed_when: false + + - name: Stop any existing split-brain monitoring service + systemd: + name: detect-rogue-primary.service + state: stopped + failed_when: false + + - name: Stop any existing repmgrd service + systemd: + name: "repmgrd@{{ postgresql_version }}-main.service" + state: stopped + failed_when: false + + - name: Unmask PostgreSQL 
services from previous deployments + systemd: + name: "postgresql@{{ postgresql_version }}-main.service" + masked: no + failed_when: false + + - name: Stop PostgreSQL service for clean state + systemd: + name: "postgresql@{{ postgresql_version }}-main.service" + state: stopped + failed_when: false + + - name: Remove repmgr configuration files, scripts, and systemd units + file: + path: "{{ item }}" + state: absent + failed_when: false + loop: + - "/etc/repmgr/{{ postgresql_version }}-main/repmgr.conf" + - "/etc/repmgr/{{ postgresql_version }}" + - "/etc/repmgr/{{ postgresql_version }}-main" + - "/var/lib/postgresql/{{ postgresql_version }}/main/recovery.conf" + - "/var/lib/postgresql/{{ postgresql_version }}/main/standby.signal" + - "/opt/repmgr/scripts" + - "/usr/local/bin/repmgr" + - "/usr/local/bin/repmgrd" + - "/usr/local/bin/detect_rogue_primary.sh" + - "/etc/systemd/system/detect-rogue-primary.service" + - "/etc/systemd/system/detect-rogue-primary.timer" + - "/etc/systemd/system/repmgrd@.service" + - "/etc/systemd/system/repmgrd@{{ postgresql_version }}-main.service" + - "/etc/systemd/system/repmgrd@{{ postgresql_version }}.service" + - "/etc/sudoers.d/postgres-postgresql-management" + - "/etc/sudoers.d/postgres-postgresql-service" + + - name: Find rogue split-brain service files + find: + paths: /etc/systemd/system + patterns: "detect-rogue-primary.service*" + register: rogue_service_files + + - name: Remove rogue split-brain service files + file: + path: "{{ item.path }}" + state: absent + loop: "{{ rogue_service_files.files }}" + when: rogue_service_files.matched > 0 + + when: not is_fresh_install + + # ===== COMMON TASKS FOR ALL INSTALLATIONS ===== + - name: Reload systemd daemon after cleanup + systemd: + daemon_reload: yes + failed_when: false + + - name: Display cleanup status + debug: + msg: | + Cleanup completed for {{ inventory_hostname }}: + - Installation type: {{ 'Fresh' if is_fresh_install else 'Existing' }} + - PostgreSQL installed: {{ 
postgresql_installed.stat.exists }} + - PostgreSQL data exists: {{ postgresql_data_exists.stat.exists }} + - repmgr config exists: {{ repmgr_config_exists.stat.exists }} + {% if is_fresh_install %} + - Action taken: Minimal setup (directories created) + {% else %} + - Action taken: Full cleanup (services stopped, configs removed) + {% endif %} + - Ready for deployment: ✅ diff --git a/ansible/postgresql-playbooks/postgresql-deploy-primary.yml b/ansible/postgresql-playbooks/postgresql-deploy-primary.yml index 5f48b4a2e..e27e4a3c2 100644 --- a/ansible/postgresql-playbooks/postgresql-deploy-primary.yml +++ b/ansible/postgresql-playbooks/postgresql-deploy-primary.yml @@ -1,4 +1,5 @@ -- name: Deploy PostgreSQL Primary node +--- +- name: Deploy PostgreSQL Primary node (Basic Setup) hosts: postgresql_rw become: yes gather_facts: yes @@ -6,87 +7,225 @@ primary_node: "{{ hostvars[(groups.get('postgresql_rw', []) | first) | default('postgresql1')]['ansible_default_ipv4']['address'] | default(hostvars[(groups.get('postgresql_rw', []) | first) | default('postgresql1')]['ansible_host'] | default((groups.get('postgresql_rw', []) | first) | default('postgresql1'))) }}" replica_node1: "{{ hostvars[(groups.get('postgresql_ro', []) | first) | default('postgresql2')]['ansible_default_ipv4']['address'] | default(hostvars[(groups.get('postgresql_ro', []) | first) | default('postgresql2')]['ansible_host'] | default((groups.get('postgresql_ro', []) | first) | default('postgresql2'))) }}" replica_node2: "{{ hostvars[(groups.get('postgresql_ro', []) | last) | default('postgresql3')]['ansible_default_ipv4']['address'] | default(hostvars[(groups.get('postgresql_ro', []) | last) | default('postgresql3')]['ansible_host'] | default((groups.get('postgresql_ro', []) | last) | default('postgresql3'))) }}" - + pg_service_name: "postgresql@{{ postgresql_version }}-main.service" tasks: - - name: Check replication user exists - community.postgresql.postgresql_query: - login_db: postgres - query: 
"SELECT 1 FROM pg_roles WHERE rolname = '{{ repsvc_user }}'" - register: repl_user_exists - become: yes - become_user: postgres - ignore_errors: yes - - - name: Check replication slots exist - community.postgresql.postgresql_query: - login_db: postgres - query: "SELECT slot_name FROM pg_replication_slots WHERE slot_name IN ('postgresql2', 'postgresql3')" - register: existing_slots - become: yes - become_user: postgres - ignore_errors: yes - - - name: Configure pg_hba.conf - ansible.builtin.template: - src: ../templates/pg_hba.conf.j2 - dest: "{{ postgresql_conf_dir }}/pg_hba.conf" + - name: Ensure repmgr scripts directory exists + ansible.builtin.file: + path: /opt/repmgr/scripts + state: directory owner: postgres group: postgres - mode: '0640' - backup: yes + mode: "0755" + + - name: Ensure repmgr configuration directory exists + ansible.builtin.file: + path: "/etc/repmgr/{{ postgresql_version }}-main" + state: directory + owner: postgres + group: postgres + mode: "0755" - - name: Configure primary node PostgreSQL settings + - name: Deploy basic primary configuration files ansible.builtin.template: - src: ../templates/postgresql_primary.conf.j2 - dest: "{{ postgresql_conf_dir }}/postgresql.conf" + src: "{{ item.src }}" + dest: "{{ item.dest }}" owner: postgres group: postgres - mode: '0640' + mode: "{{ item.mode }}" backup: yes + loop: + - src: ../templates/postgresql/pg_hba.conf.j2 + dest: "{{ postgresql_conf_dir }}/pg_hba.conf" + mode: "0640" + - src: ../templates/postgresql/postgresql.conf.j2 + dest: "{{ postgresql_conf_dir }}/postgresql.conf" + mode: "0640" + - src: ../templates/postgresql/repmgr.conf.j2 + dest: "/etc/repmgr/{{ postgresql_version }}-main/repmgr.conf" + mode: "0644" + - src: ../templates/postgresql/pgpass.j2 + dest: "/var/lib/postgresql/.pgpass" + mode: "0600" + - src: ../templates/postgresql/simple_fence.sh.j2 + dest: "/opt/repmgr/scripts/simple_fence.sh" + mode: "0755" + - src: ../templates/postgresql/failover_validation.sh.j2 + dest: 
"/opt/repmgr/scripts/failover_validation.sh" + mode: "0755" register: primary_conf_result - - name: restart postgresql primary - ansible.builtin.service: - name: postgresql + - name: Deploy repmgrd service template + ansible.builtin.template: + src: ../templates/postgresql/repmgrd_service.j2 + dest: "/etc/systemd/system/repmgrd@.service" + owner: root + group: root + mode: "0644" + register: repmgrd_service_result + + - name: Reload systemd if service template changed + ansible.builtin.command: systemctl daemon-reload + when: repmgrd_service_result.changed + + - name: Restart PostgreSQL if configuration changed + ansible.builtin.systemd: + name: "{{ pg_service_name }}" state: restarted - become: yes + masked: no when: primary_conf_result.changed - - name: Ensure PostgreSQL service is running - ansible.builtin.service: - name: postgresql + - name: Ensure PostgreSQL instance is running and enabled + ansible.builtin.systemd: + name: "{{ pg_service_name }}" state: started enabled: yes + masked: no - - name: Wait for PostgreSQL to be ready + - name: Wait for PostgreSQL to be ready ansible.builtin.wait_for: port: 5432 - host: "{{ primary_node }}" + host: "127.0.0.1" delay: 5 timeout: 60 - - name: Create replication user - community.postgresql.postgresql_user: - name: "{{ repsvc_user }}" - password: "{{ repsvc_password }}" - role_attr_flags: "REPLICATION,LOGIN" - login_db: postgres - state: present - become: yes - become_user: postgres - when: - - repl_user_exists.failed or (repl_user_exists.query_result | length == 0) - - - name: Create replication slots for replicas - community.postgresql.postgresql_slot: - name: "{{ item }}" - slot_type: physical - state: present - login_db: postgres - loop: - - "postgresql2" - - "postgresql3" - become: yes - become_user: postgres - when: - - existing_slots.failed or (item not in (existing_slots.query_result | default([]) | map(attribute='slot_name') | list)) + # ===== DATABASE SETUP ===== + - name: Setup repmgr infrastructure + block: 
+ - name: Check if repmgr user exists + ansible.builtin.shell: | + sudo -u postgres psql -tAc "SELECT COUNT(*) FROM pg_roles WHERE rolname='{{ repmgr_user }}';" + register: repmgr_user_check + changed_when: false + + - name: Create repmgr user with proper roles + ansible.builtin.shell: | + sudo -u postgres psql -c "CREATE USER {{ repmgr_user }} WITH PASSWORD '{{ repmgr_password }}' SUPERUSER REPLICATION LOGIN;" + when: repmgr_user_check.stdout.strip() == "0" + register: create_repmgr_user + + - name: Display user creation result + ansible.builtin.debug: + msg: | + repmgr user status: {{ 'CREATED' if repmgr_user_check.stdout.strip() == "0" else 'ALREADY EXISTS' }} + when: create_repmgr_user is defined + + - name: Check if repmgr database exists + ansible.builtin.shell: | + sudo -u postgres psql -tAc "SELECT COUNT(*) FROM pg_database WHERE datname='{{ repmgr_database }}';" + register: repmgr_db_check + changed_when: false + - name: Debug repmgr_db_check value + ansible.builtin.debug: + msg: "Database exists count: {{ repmgr_db_check.stdout.strip() }}" + - name: Create repmgr database with proper owner + ansible.builtin.shell: | + sudo -u postgres createdb -O "{{ repmgr_user }}" "{{ repmgr_database }}" + when: repmgr_db_check.stdout.strip() == "0" + register: create_repmgr_db + timeout: 30 + + - name: Display database creation result + ansible.builtin.debug: + msg: | + repmgr database status: {{ 'CREATED' if repmgr_db_check.stdout.strip() == "0" else 'ALREADY EXISTS' }} + when: create_repmgr_db is defined + + - name: Create repmgr extension + ansible.builtin.shell: | + sudo -u postgres psql -d "{{ repmgr_database }}" -c "CREATE EXTENSION IF NOT EXISTS repmgr;" + register: repmgr_ext_result + changed_when: "'CREATE EXTENSION' in repmgr_ext_result.stdout" + + - name: Verify user roles and database ownership + ansible.builtin.shell: | + echo "=== User Roles ===" + sudo -u postgres psql -c "\du {{ repmgr_user }}" + echo "=== Database Owner ===" + sudo -u postgres psql -c 
"SELECT datname, datdba::regrole FROM pg_database WHERE datname='{{ repmgr_database }}';" + register: verify_setup + changed_when: false + + - name: Display verification results + ansible.builtin.debug: + msg: | + Setup verification: + {{ verify_setup.stdout }} + + # ===== REPMGR REGISTRATION ===== + - name: Register primary in the cluster + block: + - name: Register as primary + ansible.builtin.command: > + sudo -u postgres repmgr + -f /etc/repmgr/{{ postgresql_version }}-main/repmgr.conf + primary register --force + register: repmgr_register_primary + + - name: Verify primary registration + ansible.builtin.shell: | + sudo -u postgres psql -d "{{ repmgr_database }}" \ + -tc "SELECT COUNT(*)::int FROM repmgr.nodes WHERE type = 'primary' AND node_name = '{{ inventory_hostname }}';" + register: verify_primary + changed_when: false + + - name: Display registration status + ansible.builtin.debug: + msg: | + Primary registration result: + - Command output: {{ repmgr_register_primary.stdout | default('') }} + - Primary nodes found: {{ verify_primary.stdout.strip() }} + - Status: {{ 'SUCCESS' if (verify_primary.stdout.strip() | int) == 1 else 'NEEDS_ATTENTION' }} + + - name: Fail if primary registration unsuccessful + ansible.builtin.fail: + msg: | + Primary registration verification failed! 
+ Expected: 1 primary node named '{{ inventory_hostname }}' + Found: {{ verify_primary.stdout.strip() }} primary nodes + + Debug commands: + sudo -u postgres repmgr cluster show + sudo -u postgres psql -d {{ repmgr_database }} -c "SELECT * FROM repmgr.nodes;" + when: (verify_primary.stdout.strip() | int) != 1 + + - name: Display cluster status + ansible.builtin.command: + cmd: sudo -u postgres repmgr -f /etc/repmgr/{{ postgresql_version }}-main/repmgr.conf cluster show + register: repmgr_verify + failed_when: false + + - name: Verify repmgr database connectivity + ansible.builtin.shell: | + sudo -u postgres psql -d "{{ repmgr_database }}" -c "SELECT version();" + environment: + PGPASSWORD: "{{ repmgr_password }}" + register: repmgr_connection_test + + - name: Start repmgrd service + ansible.builtin.systemd: + name: "repmgrd@{{ postgresql_version }}-main" + state: started + enabled: yes + daemon_reload: yes + when: repmgr_connection_test is succeeded + + - name: Verify repmgrd is running + ansible.builtin.systemd: + name: "repmgrd@{{ postgresql_version }}-main" + register: repmgrd_status + + - name: Display repmgrd status + ansible.builtin.debug: + msg: "repmgrd service: {{ repmgrd_status.status.ActiveState }}" + + - name: Display setup completion + ansible.builtin.debug: + msg: | + ===== PRIMARY SETUP COMPLETE ===== + PostgreSQL Primary is running on {{ ansible_hostname }} + Service: {{ pg_service_name }} + repmgrd: {{ repmgrd_status.status.ActiveState | default('unknown') }} + Cluster Status: + {{ repmgr_verify.stdout_lines | default(['Run repmgr cluster show manually']) | join('\n') }} + + Next: Deploy replicas using postgresql-deploy-replica.yml diff --git a/ansible/postgresql-playbooks/postgresql-deploy-replica.yml b/ansible/postgresql-playbooks/postgresql-deploy-replica.yml index a79a5634b..e0a467c16 100644 --- a/ansible/postgresql-playbooks/postgresql-deploy-replica.yml +++ b/ansible/postgresql-playbooks/postgresql-deploy-replica.yml @@ -1,108 +1,302 @@ --- 
-- name: Deploy PostgreSQL replica services with streaming replication +- name: Deploy PostgreSQL replica services (Basic Setup) hosts: postgresql_ro become: yes gather_facts: yes + serial: 1 # Deploy replicas one at a time vars: - primary_node: "{{ hostvars[(groups.get('postgresql_rw', []) | first) | default('postgresql1')]['ansible_default_ipv4']['address'] | default(hostvars[(groups.get('postgresql_rw', []) | first) | default('postgresql1')]['ansible_host'] | default((groups.get('postgresql_rw', []) | first) | default('postgresql1'))) }}" - replica_node1: "{{ hostvars[(groups.get('postgresql_ro', []) | first) | default('postgresql2')]['ansible_default_ipv4']['address'] | default(hostvars[(groups.get('postgresql_ro', []) | first) | default('postgresql2')]['ansible_host'] | default((groups.get('postgresql_ro', []) | first) | default('postgresql2'))) }}" - replica_node2: "{{ hostvars[(groups.get('postgresql_ro', []) | last) | default('postgresql3')]['ansible_default_ipv4']['address'] | default(hostvars[(groups.get('postgresql_ro', []) | last) | default('postgresql3')]['ansible_host'] | default((groups.get('postgresql_ro', []) | last) | default('postgresql3'))) }}" - + primary_node: "{{ hostvars[groups['postgresql_rw'][0]]['ansible_default_ipv4']['address'] | default(hostvars[groups['postgresql_rw'][0]]['ansible_host']) }}" + current_replica: "{{ ansible_default_ipv4.address | default(ansible_host) }}" + pg_service_name: "postgresql@{{ postgresql_version }}-main.service" tasks: - - name: Check if replica is already configured - ansible.builtin.stat: - path: "{{ postgresql_data_dir }}/standby.signal" - register: replica_configured - - - - name: Check if PostgreSQL is running - ansible.builtin.service_facts: - register: service_status - - - - name: Configure pg_hba.conf for all nodes - ansible.builtin.template: - src: ../templates/pg_hba.conf.j2 - dest: "{{ postgresql_conf_dir }}/pg_hba.conf" + # ===== INITIAL STATUS CHECK ===== + - name: Check replica configuration 
status + block: + - name: Check repmgr registration status + ansible.builtin.command: + cmd: sudo -u postgres repmgr -f /etc/repmgr/{{ postgresql_version }}-main/repmgr.conf node status + register: repmgr_status + failed_when: false + changed_when: false + + - name: Check if replica is already configured + ansible.builtin.stat: + path: "{{ postgresql_data_dir }}/standby.signal" + register: replica_configured + + - name: Display current status + ansible.builtin.debug: + msg: | + Replica Status for {{ ansible_hostname }}: + - repmgr registered: {{ repmgr_status.rc == 0 }} + - Data configured: {{ replica_configured.stat.exists }} + - Action needed: {{ not replica_configured.stat.exists }} + + # ===== CONFIGURATION DEPLOYMENT ===== + - name: Ensure repmgr configuration directory exists + ansible.builtin.file: + path: "/etc/repmgr/{{ postgresql_version }}-main" + state: directory owner: postgres group: postgres - mode: '0640' - backup: yes + mode: "0755" - - name: Configure replica nodes PostgreSQL settings - ansible.builtin.template: - src: ../templates/postgresql_replica.conf.j2 - dest: "{{ postgresql_conf_dir }}/postgresql.conf" + - name: Ensure repmgr scripts directory exists + ansible.builtin.file: + path: /opt/repmgr/scripts + state: directory owner: postgres group: postgres - mode: '0640' - backup: yes - - register: replica_conf_result - - - name: restart postgresql replica - ansible.builtin.service: - name: postgresql - state: restarted - become: yes - when: - - inventory_hostname in ["postgresql2", "postgresql3"] - - replica_conf_result is defined - - replica_conf_result.changed - - - name: Stop PostgreSQL if replication not configured - ansible.builtin.service: - name: postgresql - state: stopped - when: - - inventory_hostname in ["postgresql2", "postgresql3"] - - not replica_configured.stat.exists - - - name: Clean replica data directories only if replication not configured - ansible.builtin.file: - path: "{{ postgresql_data_dir }}" - state: absent - when: - 
- inventory_hostname in ["postgresql2", "postgresql3"] - - not replica_configured.stat.exists - become: yes - - - name: Run pg_basebackup for replicas - ansible.builtin.command: - cmd: > - /usr/bin/pg_basebackup - -h {{ primary_node }} - -U {{ repsvc_user }} - -p 5432 - -D {{ postgresql_data_dir }} - -P -R -X stream - environment: - PGPASSWORD: "{{ repsvc_password }}" - when: - - inventory_hostname in ["postgresql2", "postgresql3"] - - not replica_configured.stat.exists - become: yes - become_user: postgres - - - name: Create standby.signal file for replicas - ansible.builtin.file: - path: "{{ postgresql_data_dir }}/standby.signal" - state: touch + mode: "0755" + + - name: Deploy replica configuration files + ansible.builtin.template: + src: "{{ item.src }}" + dest: "{{ item.dest }}" owner: postgres group: postgres - mode: '0640' - when: - - inventory_hostname in ["postgresql2", "postgresql3"] - - not replica_configured.stat.exists - - - name: Start PostgreSQL service - ansible.builtin.service: - name: postgresql + mode: "{{ item.mode }}" + backup: yes + loop: + - src: ../templates/postgresql/pg_hba.conf.j2 + dest: "{{ postgresql_conf_dir }}/pg_hba.conf" + mode: "0640" + - src: ../templates/postgresql/postgresql.conf.j2 + dest: "{{ postgresql_conf_dir }}/postgresql.conf" + mode: "0640" + - src: ../templates/postgresql/repmgr.conf.j2 + dest: "/etc/repmgr/{{ postgresql_version }}-main/repmgr.conf" + mode: "0644" + - src: ../templates/postgresql/pgpass.j2 + dest: "/var/lib/postgresql/.pgpass" + mode: "0600" + - src: ../templates/postgresql/simple_fence.sh.j2 + dest: "/opt/repmgr/scripts/simple_fence.sh" + mode: "0755" + - src: ../templates/postgresql/failover_validation.sh.j2 + dest: "/opt/repmgr/scripts/failover_validation.sh" + mode: "0755" + + - name: Deploy repmgrd service template + ansible.builtin.template: + src: ../templates/postgresql/repmgrd_service.j2 + dest: "/etc/systemd/system/repmgrd@.service" + owner: root + group: root + mode: "0644" + register: 
repmgrd_service_result + + - name: Reload systemd if service template changed + ansible.builtin.command: systemctl daemon-reload + when: repmgrd_service_result.changed + + # ===== REPLICATION SETUP ===== + - name: Setup repmgr replication + block: + - name: Verify primary accessibility + ansible.builtin.wait_for: + port: 5432 + host: "{{ primary_node }}" + timeout: 60 + + - name: Test primary connection with repmgr credentials + community.postgresql.postgresql_query: + login_host: "{{ primary_node }}" + login_user: "{{ repmgr_user }}" + login_password: "{{ repmgr_password }}" + login_db: "{{ repmgr_database }}" + query: "SELECT 'Connection successful' as status" + register: primary_connection_test + + - name: Prepare for replication setup + block: + - name: Stop PostgreSQL service + ansible.builtin.systemd: + name: "{{ pg_service_name }}" + state: stopped + + - name: Remove existing data directory + ansible.builtin.file: + path: "{{ postgresql_data_dir }}" + state: absent + + - name: Create clean data directory + ansible.builtin.file: + path: "{{ postgresql_data_dir }}" + state: directory + owner: postgres + group: postgres + mode: "0700" + + when: not replica_configured.stat.exists + + - name: Clone replica from primary + ansible.builtin.command: + cmd: > + sudo -u postgres repmgr -h {{ primary_node }} -U {{ repmgr_user }} -d {{ repmgr_database }} + -f /etc/repmgr/{{ postgresql_version }}-main/repmgr.conf + standby clone --force + environment: + PGPASSWORD: "{{ repmgr_password }}" + register: repmgr_clone_result + when: not replica_configured.stat.exists + + - name: Verify standby.signal was created by clone + ansible.builtin.stat: + path: "{{ postgresql_data_dir }}/standby.signal" + register: standby_signal_after_clone + when: not replica_configured.stat.exists + + - name: Fail if standby.signal is missing after clone + ansible.builtin.fail: + msg: | + standby.signal not found after clone! 
+ Clone output: {{ repmgr_clone_result.stdout | default('') }} + Clone errors: {{ repmgr_clone_result.stderr | default('') }} + + Debug commands: + ls -la {{ postgresql_data_dir }}/ + sudo -u postgres repmgr node status + when: + - not replica_configured.stat.exists + - not standby_signal_after_clone.stat.exists + + - name: Display clone results + ansible.builtin.debug: + msg: "{{ repmgr_clone_result.stdout_lines | default(['Clone skipped - already configured']) }}" + + - name: Ensure repmgrd service is enabled + ansible.builtin.systemd: + name: "repmgrd@{{ postgresql_version }}-main" + enabled: yes + + # ===== SERVICE STARTUP ===== + - name: Start PostgreSQL service + ansible.builtin.systemd: + name: "{{ pg_service_name }}" state: started enabled: yes - - - name: Wait for replicas to be ready + masked: no + + - name: Wait for PostgreSQL to be ready ansible.builtin.wait_for: port: 5432 + host: "127.0.0.1" delay: 10 timeout: 120 + + # ===== REPLICATION VERIFICATION ===== + - name: Verify replication setup + block: + - name: Check recovery status + community.postgresql.postgresql_query: + login_host: "127.0.0.1" + login_user: "{{ repmgr_user }}" + login_password: "{{ repmgr_password }}" + db: "{{ repmgr_database }}" + query: | + SELECT + pg_is_in_recovery() as is_replica, + pg_last_wal_receive_lsn() as last_wal_received, + CASE + WHEN pg_is_in_recovery() THEN 'REPLICA' + ELSE 'PRIMARY/ERROR' + END as node_role + register: recovery_status + + - name: Display recovery status + ansible.builtin.debug: + msg: | + Replication Status: + - Role: {{ recovery_status.query_result[0].node_role }} + - Last WAL: {{ recovery_status.query_result[0].last_wal_received }} + + - name: Show local standby indicators + ansible.builtin.stat: + path: "{{ postgresql_data_dir }}/standby.signal" + register: standby_signal_present + + - name: Report standby indicators + ansible.builtin.debug: + msg: | + Diagnostics: + - standby.signal present: {{ standby_signal_present.stat.exists }} + - Data 
dir: {{ postgresql_data_dir }} + - Service: {{ pg_service_name }} + + - name: Verify replica is working + ansible.builtin.fail: + msg: | + Replica setup failed - node is not in recovery mode! + Current role: {{ recovery_status.query_result[0].node_role }} + standby.signal present: {{ standby_signal_present.stat.exists }} + + Check PostgreSQL logs: + sudo tail -50 /var/log/postgresql/postgresql-*.log + when: not recovery_status.query_result[0].is_replica + + # ===== REPMGR REGISTRATION ===== + - name: Register and start repmgr services + block: + - name: Register replica with repmgr + ansible.builtin.command: + cmd: sudo -u postgres repmgr -f /etc/repmgr/{{ postgresql_version }}-main/repmgr.conf standby register --force + when: repmgr_status.rc != 0 + register: repmgr_registration + + - name: Verify replica registration + community.postgresql.postgresql_query: + login_host: "127.0.0.1" + login_user: "{{ repmgr_user }}" + login_password: "{{ repmgr_password }}" + db: "{{ repmgr_database }}" + query: | + SELECT COUNT(*)::int AS cnt + FROM repmgr.nodes + WHERE type = 'standby' AND node_name = '{{ inventory_hostname }}'; + register: verify_replica_reg + changed_when: false + + - name: Display registration results + ansible.builtin.debug: + msg: | + Replica registration result: + - Status: {{ 'SUCCESS' if verify_replica_reg.query_result[0].cnt == 1 else 'FAILED' }} + - Standby nodes found: {{ verify_replica_reg.query_result[0].cnt }} + + - name: Fail if replica registration unsuccessful + ansible.builtin.fail: + msg: | + Replica registration failed for {{ inventory_hostname }}! 
+ Expected: 1 standby node + Found: {{ verify_replica_reg.query_result[0].cnt }} standby nodes + + Debug commands: + sudo -u postgres repmgr cluster show + sudo -u postgres repmgr node status + when: (verify_replica_reg.query_result[0].cnt | int) != 1 + + - name: Start repmgrd service + ansible.builtin.systemd: + name: "repmgrd@{{ postgresql_version }}-main" + state: started + enabled: yes + daemon_reload: yes + + - name: Verify repmgrd is running + ansible.builtin.systemd: + name: "repmgrd@{{ postgresql_version }}-main" + register: repmgrd_status + + - name: Display repmgrd status + ansible.builtin.debug: + msg: "repmgrd service: {{ repmgrd_status.status.ActiveState }}" + + - name: Display setup completion + ansible.builtin.debug: + msg: | + ===== REPLICA SETUP COMPLETE ===== + PostgreSQL Replica is running on {{ ansible_hostname }} + Service: {{ pg_service_name }} + Role: {{ recovery_status.query_result[0].node_role }} + repmgrd: {{ repmgrd_status.status.ActiveState | default('unknown') }} diff --git a/ansible/postgresql-playbooks/postgresql-install.yml b/ansible/postgresql-playbooks/postgresql-install.yml index a2f2139e3..28fe2f0be 100644 --- a/ansible/postgresql-playbooks/postgresql-install.yml +++ b/ansible/postgresql-playbooks/postgresql-install.yml @@ -5,70 +5,277 @@ vars: postgresql_use_repository: false + # Structured package definitions + system_dependencies: + - libssl-dev + - libllvm15 + - sysstat + - ssl-cert + - libjson-perl + - libipc-run-perl + + repository_packages: + - postgresql-{{ postgresql_version }} + - postgresql-client-{{ postgresql_version }} + - python3-psycopg2 + + # Package categorization for offline installation + postgresql_core_packages: "{{ postgresql_pkgs | rejectattr('name', 'match', '^repmgr') | rejectattr('name', 'contains', '-repmgr') | list }}" + + # Ordered repmgr packages (dependency order matters) + repmgr_packages_ordered: + - repmgr-common + - "postgresql-{{ postgresql_version }}-repmgr" + - repmgr + + # Directory 
structure definitions + repmgr_directories: + - path: "/etc/repmgr/{{ postgresql_version }}" + owner: postgres + group: postgres + mode: "0755" + - path: "/opt/repmgr/scripts" + owner: postgres + group: postgres + mode: "0755" + - path: "/var/log/postgresql" + owner: postgres + group: postgres + mode: "0755" + - path: "/etc/systemd/system/postgresql@{{ postgresql_version }}-main.service.d" + owner: root + group: root + mode: "0755" + tasks: - - name: Install PostgreSQL dependencies + # ===== PHASE 1: SYSTEM DEPENDENCIES ===== + + - name: Install system dependencies become: yes ansible.builtin.apt: - name: - - libssl-dev - - libllvm15 - - sysstat - - ssl-cert - - libjson-perl - - libipc-run-perl + name: "{{ system_dependencies }}" state: present update_cache: yes + cache_valid_time: 3600 + + # ===== PHASE 2: POSTGRESQL INSTALLATION ===== - - name: Install PostgreSQL packages from repository + - name: Install PostgreSQL from repository become: yes ansible.builtin.apt: - name: - - postgresql-{{ postgresql_version }} - - postgresql-client-{{ postgresql_version }} - - python3-psycopg2 + name: "{{ repository_packages }}" state: present update_cache: yes when: postgresql_use_repository - - name: Check installed versions of PostgreSQL packages - ansible.builtin.command: dpkg -s {{ item.name }} - loop: "{{ postgresql_pkgs }}" - register: pkg_check - ignore_errors: yes - changed_when: false + # PostgreSQL offline installation block + - name: Install PostgreSQL from offline packages + block: + - name: Check PostgreSQL package installation status + ansible.builtin.shell: | + if dpkg-query -W -f='${Package}\t${Status}\n' {{ item.name }} 2>/dev/null | grep -q "install ok installed"; then + echo "installed" + else + echo "not_installed" + fi + register: pg_package_status + loop: "{{ postgresql_core_packages }}" + changed_when: false + failed_when: false + + - name: Identify PostgreSQL packages to install + ansible.builtin.set_fact: + pg_packages_to_install: "{{ 
pg_packages_to_install | default([]) + [item.item] }}" + loop: "{{ pg_package_status.results }}" + when: item.stdout == "not_installed" + + - name: Display PostgreSQL installation plan + ansible.builtin.debug: + msg: | + PostgreSQL Installation Plan: + - Total packages: {{ postgresql_core_packages | length }} + - Already installed: {{ (pg_package_status.results | selectattr('stdout', 'equalto', 'installed') | list | length) }} + - To install: {{ pg_packages_to_install | default([]) | length }} + + - name: Download PostgreSQL packages + ansible.builtin.get_url: + url: "{{ item.url }}" + dest: "/tmp/{{ item.name }}.deb" + checksum: "{{ item.checksum }}" + validate_certs: no + timeout: 30 + loop: "{{ pg_packages_to_install | default([]) }}" + + - name: Install PostgreSQL packages + become: yes + ansible.builtin.apt: + deb: "/tmp/{{ item.name }}.deb" + state: present + loop: "{{ pg_packages_to_install | default([]) }}" + register: pg_installation_result + + - name: Clean up PostgreSQL package files + ansible.builtin.file: + path: "/tmp/{{ item.name }}.deb" + state: absent + loop: "{{ pg_packages_to_install | default([]) }}" + when: not postgresql_use_repository - - name: Download PostgreSQL packages - ansible.builtin.get_url: - url: "{{ item.url }}" - dest: "/tmp/{{ item.url | basename }}" - checksum: "{{ item.checksum }}" - validate_certs: no - loop: "{{ postgresql_pkgs }}" - when: - - not postgresql_use_repository - - pkg_check.results[item_loop_index].rc != 0 - loop_control: - index_var: item_loop_index - - - name: Install PostgreSQL packages from downloaded files + # ===== PHASE 3: REPMGR INSTALLATION ===== + + - name: Install repmgr from repository become: yes ansible.builtin.apt: - deb: "/tmp/{{ item.url | basename }}" - loop: "{{ postgresql_pkgs }}" - when: - - not postgresql_use_repository - - pkg_check.results[item_loop_index].rc != 0 - loop_control: - index_var: item_loop_index - - - name: Clean up downloaded PostgreSQL packages + name: + - repmgr-common + - 
"postgresql-{{ postgresql_version }}-repmgr" + - repmgr + state: present + when: postgresql_use_repository + + # repmgr offline installation block + - name: Install repmgr from offline packages + block: + - name: Get repmgr packages in correct order + ansible.builtin.set_fact: + repmgr_packages_filtered: "{{ repmgr_packages_filtered | default([]) + [item] }}" + loop: "{{ postgresql_pkgs }}" + when: item.name in repmgr_packages_ordered + + - name: Sort repmgr packages by dependency order + ansible.builtin.set_fact: + repmgr_packages_sorted: | + {%- set sorted_packages = [] -%} + {%- for pkg_name in repmgr_packages_ordered -%} + {%- for pkg in repmgr_packages_filtered -%} + {%- if pkg.name == pkg_name -%} + {%- set _ = sorted_packages.append(pkg) -%} + {%- endif -%} + {%- endfor -%} + {%- endfor -%} + {{ sorted_packages }} + + - name: Check repmgr package installation status + ansible.builtin.shell: | + if dpkg-query -W -f='${Package}\t${Status}\n' {{ item.name }} 2>/dev/null | grep -q "install ok installed"; then + echo "installed" + else + echo "not_installed" + fi + register: repmgr_package_status + loop: "{{ repmgr_packages_sorted }}" + changed_when: false + failed_when: false + + - name: Identify repmgr packages to install + ansible.builtin.set_fact: + repmgr_packages_to_install: "{{ repmgr_packages_to_install | default([]) + [item.item] }}" + loop: "{{ repmgr_package_status.results }}" + when: item.stdout == "not_installed" + + - name: Display repmgr installation plan + ansible.builtin.debug: + msg: | + repmgr Installation Plan: + - Installation order: {{ repmgr_packages_ordered | join(' → ') }} + - To install: {{ repmgr_packages_to_install | default([]) | map(attribute='name') | join(', ') }} + + - name: Download repmgr packages + ansible.builtin.get_url: + url: "{{ item.url }}" + dest: "/tmp/{{ item.name }}.deb" + checksum: "{{ item.checksum }}" + validate_certs: no + timeout: 30 + loop: "{{ repmgr_packages_to_install | default([]) }}" + + - name: Install 
repmgr packages in dependency order + become: yes + ansible.builtin.apt: + deb: "/tmp/{{ item.name }}.deb" + state: present + loop: "{{ repmgr_packages_to_install | default([]) }}" + register: repmgr_installation_result + failed_when: + - repmgr_installation_result.failed + - "'already installed' not in (repmgr_installation_result.msg | default(''))" + + - name: Clean up repmgr package files + ansible.builtin.file: + path: "/tmp/{{ item.name }}.deb" + state: absent + loop: "{{ repmgr_packages_to_install | default([]) }}" + + when: not postgresql_use_repository + + # ===== PHASE 4: DIRECTORY STRUCTURE AND CLEANUP ===== + + - name: Create repmgr directory structure + become: yes ansible.builtin.file: - path: "/tmp/{{ item.url | basename }}" - state: absent - loop: "{{ postgresql_pkgs }}" - when: - - not postgresql_use_repository - - pkg_check.results[item_loop_index].rc != 0 - loop_control: - index_var: item_loop_index + path: "{{ item.path }}" + state: directory + owner: "{{ item.owner }}" + group: "{{ item.group }}" + mode: "{{ item.mode }}" + loop: "{{ repmgr_directories }}" + + - name: Deploy repmgrd systemd service template + become: yes + ansible.builtin.template: + src: ../templates/postgresql/repmgrd_service.j2 + dest: "/etc/systemd/system/repmgrd@.service" + mode: "0644" + register: repmgrd_service_deployed + + # ===== PHASE 5: SIMPLIFIED INSTALLATION VERIFICATION ===== + + - name: Verify installations + block: + - name: Check PostgreSQL packages + ansible.builtin.shell: | + echo "=== PostgreSQL Packages ===" + dpkg -l | grep "postgresql-{{ postgresql_version }}" | awk '{print $2 ": " $1}' + register: pg_packages + changed_when: false + + - name: Check repmgr packages + ansible.builtin.shell: | + echo "=== repmgr Packages ===" + dpkg -l | grep repmgr | awk '{print $2 ": " $1}' + register: repmgr_packages + changed_when: false + + - name: Check PostgreSQL binaries + ansible.builtin.shell: | + echo "=== PostgreSQL Binaries ===" + ls -la /usr/lib/postgresql/{{ 
postgresql_version }}/bin/postgres 2>/dev/null && echo "postgres: FOUND" || echo "postgres: MISSING" + ls -la /usr/lib/postgresql/{{ postgresql_version }}/bin/psql 2>/dev/null && echo "psql: FOUND" || echo "psql: MISSING" + register: pg_binaries + changed_when: false + + - name: Check repmgr binary + ansible.builtin.shell: | + echo "=== repmgr Binary ===" + ls -la /usr/bin/repmgr 2>/dev/null && echo "repmgr: FOUND" || echo "repmgr: MISSING" + register: repmgr_binary + changed_when: false + + - name: Display installation summary + ansible.builtin.debug: + msg: | + ===== INSTALLATION COMPLETE ===== + + PostgreSQL Packages: + {{ pg_packages.stdout }} + + repmgr Packages: + {{ repmgr_packages.stdout }} + + Binaries: + {{ pg_binaries.stdout }} + {{ repmgr_binary.stdout }} + + Note: Version checks will work after cluster configuration. + This is an installation-only playbook; cluster setup comes next. diff --git a/ansible/postgresql-playbooks/postgresql-monitoring.yml b/ansible/postgresql-playbooks/postgresql-monitoring.yml new file mode 100644 index 000000000..e1249ff98 --- /dev/null +++ b/ansible/postgresql-playbooks/postgresql-monitoring.yml @@ -0,0 +1,98 @@ +--- +- name: Deploy PostgreSQL cluster monitoring after setup + hosts: postgresql_rw:postgresql_ro + become: yes + tags: + - postgresql-monitoring + - post-deploy + + tasks: + - name: Remove existing sudoers file to force refresh + file: + path: /etc/sudoers.d/postgres-postgresql-service + state: absent + + - name: Configure sudo access for postgres user to manage PostgreSQL service + copy: + content: | + # Allow postgres to control PostgreSQL/repmgrd non-interactively + postgres ALL=(root) NOPASSWD: /bin/systemctl start postgresql@{{ postgresql_version }}-main + postgres ALL=(root) NOPASSWD: /bin/systemctl stop postgresql@{{ postgresql_version }}-main + postgres ALL=(root) NOPASSWD: /bin/systemctl restart postgresql@{{ postgresql_version }}-main + postgres ALL=(root) NOPASSWD: /bin/systemctl reload postgresql@{{ 
postgresql_version }}-main + postgres ALL=(root) NOPASSWD: /bin/systemctl kill postgresql@{{ postgresql_version }}-main + postgres ALL=(root) NOPASSWD: /bin/systemctl mask postgresql@{{ postgresql_version }}-main + postgres ALL=(root) NOPASSWD: /bin/systemctl unmask postgresql@{{ postgresql_version }}-main + postgres ALL=(root) NOPASSWD: /bin/systemctl start repmgrd@{{ postgresql_version }}-main + postgres ALL=(root) NOPASSWD: /bin/systemctl stop repmgrd@{{ postgresql_version }}-main + # With suffix (used by detect_rogue_primary) + postgres ALL=(root) NOPASSWD: /bin/systemctl start postgresql@{{ postgresql_version }}-main.service + postgres ALL=(root) NOPASSWD: /bin/systemctl stop postgresql@{{ postgresql_version }}-main.service + postgres ALL=(root) NOPASSWD: /bin/systemctl restart postgresql@{{ postgresql_version }}-main.service + postgres ALL=(root) NOPASSWD: /bin/systemctl reload postgresql@{{ postgresql_version }}-main.service + postgres ALL=(root) NOPASSWD: /bin/systemctl kill postgresql@{{ postgresql_version }}-main.service + postgres ALL=(root) NOPASSWD: /bin/systemctl mask postgresql@{{ postgresql_version }}-main.service + postgres ALL=(root) NOPASSWD: /bin/systemctl unmask postgresql@{{ postgresql_version }}-main.service + dest: /etc/sudoers.d/postgres-postgresql-service + mode: "0440" + owner: root + group: root + validate: "visudo -cf %s" + force: yes # This forces overwrite even if content appears identical + + - name: Deploy split-brain detection script + template: + src: ../templates/postgresql/detect_rogue_primary.sh.j2 + dest: /usr/local/bin/detect_rogue_primary.sh + mode: "0755" + owner: postgres + group: postgres + backup: yes + notify: restart monitoring timer + + - name: Create systemd service for split-brain detection + template: + src: ../templates/postgresql/detect-rogue-primary.service.j2 + dest: /etc/systemd/system/detect-rogue-primary.service + mode: "0644" + backup: yes + notify: + - reload systemd + - restart monitoring timer + + - 
name: Create systemd timer for periodic monitoring + template: + src: ../templates/postgresql/detect-rogue-primary.timer.j2 + dest: /etc/systemd/system/detect-rogue-primary.timer + mode: "0644" + backup: yes + notify: + - reload systemd + - restart monitoring timer + + - name: Enable and start monitoring timer + systemd: + name: detect-rogue-primary.timer + enabled: yes + state: started + daemon_reload: yes + + - name: Verify monitoring service is configured correctly + command: systemctl status detect-rogue-primary.timer + register: timer_status + changed_when: false + + - name: Display monitoring timer status + debug: + msg: "{{ timer_status.stdout_lines }}" + + handlers: + - name: reload systemd + systemd: + daemon_reload: yes + + - name: restart monitoring timer + systemd: + name: detect-rogue-primary.timer + state: restarted + daemon_reload: yes diff --git a/ansible/postgresql-playbooks/postgresql-verify-HA.yml b/ansible/postgresql-playbooks/postgresql-verify-HA.yml index aa6c56066..aacdb5f25 100644 --- a/ansible/postgresql-playbooks/postgresql-verify-HA.yml +++ b/ansible/postgresql-playbooks/postgresql-verify-HA.yml @@ -6,15 +6,18 @@ tasks: - name: Verify streaming replication status on primary community.postgresql.postgresql_query: - login_db: postgres + login_host: "127.0.0.1" + login_user: "{{ repmgr_user }}" + login_password: "{{ repmgr_password }}" + db: "{{ repmgr_database }}" query: | - SELECT - client_addr, - application_name, - state, + SELECT + client_addr, + application_name, + state, sync_state, pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) as lag_size, - CASE + CASE WHEN pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) = 0 THEN 'SYNCHRONIZED' WHEN pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) < 1024*1024 THEN 'NEAR_SYNC' ELSE 'LAGGING' @@ -23,53 +26,248 @@ WHERE application_name IN ('postgresql2', 'postgresql3') ORDER BY application_name; register: replication_status - become: yes - become_user: postgres - name: Display 
streaming replication status ansible.builtin.debug: - var: replication_status.query_result + msg: | + Streaming Replication Status: + {% for replica in replication_status.query_result %} + - {{ replica.application_name }}: {{ replica.state }} ({{ replica.status }}) - Lag: {{ replica.lag_size }} + {% endfor %} + when: replication_status.query_result | length > 0 + + - name: Display no replicas message + ansible.builtin.debug: + msg: "No streaming replicas connected" + when: replication_status.query_result | length == 0 - name: Verify replication slots are active + ansible.builtin.shell: | + sudo -u postgres psql -d postgres -c " + SELECT + slot_name, + active, + pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) as slot_lag, + CASE + WHEN active THEN 'ACTIVE' + ELSE 'INACTIVE - CHECK REPLICA' + END as slot_status + FROM pg_replication_slots + WHERE slot_name IN ('repmgr_slot_2', 'repmgr_slot_3') + ORDER BY slot_name;" + register: slot_status_raw + changed_when: false + + - name: Display replication slots status + ansible.builtin.debug: + msg: | + Replication Slots Status: + {{ slot_status_raw.stdout }} + + - name: Check WAL disk usage on primary + ansible.builtin.shell: | + sudo -u postgres psql -d postgres -c " + SELECT + pg_size_pretty(sum(size)) as total_wal_size, + count(*) as wal_files, + CASE + WHEN sum(size) > 2147483648 THEN 'WARNING: >2GB WAL usage' + WHEN sum(size) > 1073741824 THEN 'CAUTION: >1GB WAL usage' + ELSE 'OK' + END as wal_status + FROM pg_ls_waldir();" + register: wal_usage_raw + changed_when: false + + - name: Display WAL usage status + ansible.builtin.debug: + msg: | + WAL Usage Status: + {{ wal_usage_raw.stdout }} + + - name: Check repmgr cluster status + ansible.builtin.command: + cmd: sudo -u postgres repmgr -f /etc/repmgr/{{ postgresql_version }}-main/repmgr.conf cluster show + register: cluster_status + changed_when: false + + - name: Display repmgr cluster status + ansible.builtin.debug: + msg: | + repmgr Cluster Status: 
+ {{ cluster_status.stdout_lines | join('\n') }} + + - name: Check repmgr events + ansible.builtin.command: + cmd: sudo -u postgres repmgr -f /etc/repmgr/{{ postgresql_version }}-main/repmgr.conf cluster event --limit=10 + register: cluster_events + changed_when: false + ignore_errors: yes + + - name: Display recent cluster events + ansible.builtin.debug: + msg: | + Recent Cluster Events: + {{ cluster_events.stdout_lines | join('\n') }} + when: cluster_events.rc == 0 + + - name: Verify all nodes are registered and active community.postgresql.postgresql_query: - login_db: postgres + db: "{{ repmgr_database }}" + login_host: "127.0.0.1" + login_user: "{{ repmgr_user }}" + login_password: "{{ repmgr_password }}" query: | - SELECT - slot_name, - active, - pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) as slot_lag, - CASE + SELECT + node_id, + node_name, + type, + active, + CASE WHEN active THEN 'ACTIVE' - ELSE 'INACTIVE - CHECK REPLICA' - END as slot_status - FROM pg_replication_slots - WHERE slot_name IN ('postgresql2', 'postgresql3') - ORDER BY slot_name; - register: slot_status - become: yes - become_user: postgres + ELSE 'INACTIVE/FENCED' + END as node_status + FROM repmgr.nodes + ORDER BY node_id; + register: node_registration - - name: Display replication slots status + - name: Display node registration status ansible.builtin.debug: - var: slot_status.query_result + msg: | + Node Registration Status: + {% for node in node_registration.query_result %} + - Node {{ node.node_id }} ({{ node.node_name }}): {{ node.type | upper }} - {{ node.node_status }} + {% endfor %} + when: node_registration is defined - - name: Check WAL disk usage on primary + - name: Check PostgreSQL version + ansible.builtin.shell: | + sudo -u postgres psql -d postgres -c "SELECT version();" + register: pg_version_raw + changed_when: false + + - name: Display PostgreSQL version + ansible.builtin.debug: + msg: | + PostgreSQL Version: + {{ pg_version_raw.stdout }} + + - name: 
Generate health summary + ansible.builtin.shell: | + sudo -u postgres psql -d postgres -c " + SELECT + 'Primary Health Check' as check_type, + COUNT(DISTINCT client_addr) as connected_replicas, + COUNT(*) FILTER (WHERE state = 'streaming') as streaming_replicas, + COUNT(*) FILTER (WHERE sync_state = 'sync') as sync_replicas, + CASE + WHEN COUNT(*) = 0 THEN 'NO_REPLICAS' + WHEN COUNT(*) FILTER (WHERE state = 'streaming') = COUNT(*) THEN 'ALL_STREAMING' + ELSE 'PARTIAL_STREAMING' + END as replication_health + FROM pg_stat_replication;" + register: health_summary_raw + changed_when: false + + - name: Display health summary + ansible.builtin.debug: + msg: | + PostgreSQL HA Health Summary: + {{ health_summary_raw.stdout }} + +# Additional verification on replica nodes +- name: Verify PostgreSQL replicas + hosts: postgresql_ro + become: yes + gather_facts: yes + tasks: + - name: Check replica recovery status community.postgresql.postgresql_query: - login_db: postgres + login_host: "127.0.0.1" + login_user: "{{ repmgr_user }}" + login_password: "{{ repmgr_password }}" + db: "{{ repmgr_database }}" query: | - SELECT - pg_size_pretty(sum(size)) as total_wal_size, - count(*) as wal_files, - CASE - WHEN sum(size) > 2147483648 THEN 'WARNING: >2GB WAL usage' - WHEN sum(size) > 1073741824 THEN 'CAUTION: >1GB WAL usage' - ELSE 'OK' - END as wal_status - FROM pg_ls_waldir(); - register: wal_usage - become: yes - become_user: postgres - - - name: Display WAL usage status + SELECT + pg_is_in_recovery() as is_replica, + pg_last_wal_receive_lsn() as last_wal_received, + pg_last_wal_replay_lsn() as last_wal_replayed, + CASE + WHEN pg_is_in_recovery() THEN 'REPLICA' + ELSE 'PRIMARY/STANDALONE' + END as node_role + register: replica_status + + - name: Display replica status + ansible.builtin.debug: + msg: | + Replica Status for {{ inventory_hostname }}: + - Role: {{ replica_status.query_result[0].node_role }} + - Is Replica: {{ replica_status.query_result[0].is_replica }} + - Last WAL 
Received: {{ replica_status.query_result[0].last_wal_received }} + - Last WAL Replayed: {{ replica_status.query_result[0].last_wal_replayed }} + + - name: Check replica lag + community.postgresql.postgresql_query: + login_host: "127.0.0.1" + login_user: "{{ repmgr_user }}" + login_password: "{{ repmgr_password }}" + db: "{{ repmgr_database }}" + query: | + SELECT + CASE + WHEN pg_is_in_recovery() THEN + pg_size_pretty( + pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) + ) + ELSE 'Not a replica' + END as replay_lag, + CASE + WHEN pg_is_in_recovery() THEN + CASE + WHEN pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) = 0 THEN 'UP_TO_DATE' + WHEN pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) < 1024*1024 THEN 'SLIGHT_LAG' + ELSE 'SIGNIFICANT_LAG' + END + ELSE 'NOT_REPLICA' + END as lag_status + register: replica_lag + + - name: Display replica lag information + ansible.builtin.debug: + msg: | + Replica Lag for {{ inventory_hostname }}: + - Replay Lag: {{ replica_lag.query_result[0].replay_lag }} + - Lag Status: {{ replica_lag.query_result[0].lag_status }} + + - name: Check repmgrd service status + ansible.builtin.systemd: + name: "repmgrd@{{ postgresql_version }}-main" + register: repmgrd_status + + - name: Display repmgrd status + ansible.builtin.debug: + msg: "repmgrd service: {{ repmgrd_status.status.ActiveState | default('unknown') }}" + when: repmgrd_status is defined + + - name: Test replica read-only access + community.postgresql.postgresql_query: + login_host: "127.0.0.1" + login_user: "{{ repmgr_user }}" + login_password: "{{ repmgr_password }}" + db: "{{ repmgr_database }}" + query: | + SELECT + 'Replica accessible' as status, + current_database() as database, + current_user as user, + inet_server_addr() as server_ip + register: replica_connectivity + + - name: Display replica connectivity ansible.builtin.debug: - var: wal_usage.query_result + msg: | + Connectivity Test for {{ inventory_hostname 
}}: + - Status: {{ replica_connectivity.query_result[0].status }} + - Database: {{ replica_connectivity.query_result[0].database }} + - Server IP: {{ replica_connectivity.query_result[0].server_ip }} diff --git a/ansible/postgresql-playbooks/postgresql-wire-setup.yml b/ansible/postgresql-playbooks/postgresql-wire-setup.yml index d1393d26c..580ddcff0 100644 --- a/ansible/postgresql-playbooks/postgresql-wire-setup.yml +++ b/ansible/postgresql-playbooks/postgresql-wire-setup.yml @@ -1,79 +1,178 @@ --- - name: Create PostgreSQL database and user for wire-server hosts: postgresql_rw + become: yes + gather_facts: yes + vars: + pg_secret_name: wire-postgresql-external-secret tasks: + - name: Validate kubectl is accessible + ansible.builtin.shell: kubectl cluster-info + register: kubectl_check + delegate_to: localhost + run_once: true + failed_when: false + changed_when: false + + - name: Fail if kubectl is not accessible + ansible.builtin.fail: + msg: | + ERROR: Cannot access Kubernetes cluster! + + Please ensure: + 1. kubectl is installed and accessible + 2. KUBECONFIG is set: export KUBECONFIG=/path/to/kubeconfig + 3. 
kubectl can connect: kubectl cluster-info + + Current KUBECONFIG: {{ lookup('env', 'KUBECONFIG') | default('not set') }} + Error: {{ kubectl_check.stderr | default('unknown') }} + when: kubectl_check.rc != 0 + delegate_to: localhost + run_once: true + - name: Check if PostgreSQL database exists - community.postgresql.postgresql_query: - query: "SELECT 1 FROM pg_database WHERE datname = '{{ wire_dbname }}'" - become: yes - become_user: postgres + ansible.builtin.shell: | + sudo -u postgres psql -c "SELECT 1 FROM pg_database WHERE datname = '{{ wire_dbname }}'" | grep -q '1 row' register: db_check_result - ignore_errors: yes + failed_when: false + changed_when: false - name: Create PostgreSQL database - community.postgresql.postgresql_db: - name: "{{ wire_dbname }}" - state: present - become: yes - become_user: postgres - when: db_check_result.rowcount == 0 + ansible.builtin.shell: | + sudo -u postgres createdb "{{ wire_dbname }}" + register: create_db_result + failed_when: create_db_result.rc != 0 and 'already exists' not in create_db_result.stderr + when: db_check_result.rc != 0 - name: Display message if database already exists - debug: + ansible.builtin.debug: msg: "PostgreSQL database '{{ wire_dbname }}' already exists." - when: db_check_result.rowcount > 0 + when: db_check_result.rc == 0 - name: Check if PostgreSQL user exists - community.postgresql.postgresql_query: - query: "SELECT 1 FROM pg_roles WHERE rolname = '{{ wire_user }}'" - become: yes - become_user: postgres + ansible.builtin.shell: | + sudo -u postgres psql -c "SELECT 1 FROM pg_roles WHERE rolname = '{{ wire_user }}'" | grep -q '1 row' register: user_check_result - ignore_errors: yes + failed_when: false + changed_when: false - name: Display message if user already exists - debug: + ansible.builtin.debug: msg: "PostgreSQL user '{{ wire_user }}' already exists." 
- when: user_check_result.rowcount > 0 + when: user_check_result.rc == 0 + + - name: Check if Kubernetes Secret exists + ansible.builtin.shell: | + kubectl get secret {{ pg_secret_name }} -n {{ wire_namespace | default('default') }} -o jsonpath='{.data.password}' 2>/dev/null | base64 -d + register: k8s_secret_check + delegate_to: localhost + run_once: true + failed_when: false + changed_when: false - - name: Generate random password if wire_pass is not defined - set_fact: - wire_pass: "{{ lookup('password', '/dev/null length=15 chars=ascii_letters,digits') }}" + - name: Retrieve password from existing Kubernetes Secret + ansible.builtin.set_fact: + wire_pass: "{{ k8s_secret_check.stdout }}" + password_source: "kubernetes-secret" when: - - wire_pass is not defined or wire_pass == "" - - user_check_result.rowcount == 0 - - - name: Create PostgreSQL user - community.postgresql.postgresql_user: - name: "{{ wire_user }}" - password: "{{ wire_pass }}" - db: "{{ wire_dbname }}" - state: present - become: yes - become_user: postgres + - k8s_secret_check.rc == 0 + - k8s_secret_check.stdout != "" + run_once: true + + - name: Generate random strong password if secret doesn't exist + ansible.builtin.set_fact: + wire_pass: "{{ lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}" + password_generated: true + when: + - k8s_secret_check.rc != 0 or k8s_secret_check.stdout == "" + run_once: true + + - name: Create Kubernetes Secret with generated password + ansible.builtin.shell: | + kubectl create secret generic {{ pg_secret_name }} \ + --namespace={{ wire_namespace | default('default') }} \ + --from-literal=password='{{ wire_pass }}' \ + --from-literal=username='{{ wire_user }}' \ + --from-literal=database='{{ wire_dbname }}' \ + --dry-run=client -o yaml | \ + kubectl label --local -f - \ + app=wire-server \ + component=postgresql \ + managed-by=ansible \ + --dry-run=client -o yaml | \ + kubectl apply -f - + delegate_to: localhost + run_once: true + when: + - 
password_generated is defined + - password_generated + + - name: Create PostgreSQL user (if not exists) + ansible.builtin.shell: | + sudo -u postgres psql -c "CREATE USER \"{{ wire_user }}\" WITH PASSWORD '{{ wire_pass }}';" register: user_creation_result - ignore_errors: yes - when: user_check_result.rowcount == 0 - - - name: Grant privileges to the user - community.postgresql.postgresql_privs: - database: "{{ wire_dbname }}" - roles: "{{ wire_user }}" - privs: ALL - type: database - become: yes - become_user: postgres - when: user_creation_result.changed + failed_when: user_creation_result.rc != 0 and 'already exists' not in user_creation_result.stderr + when: user_check_result.rc != 0 + + - name: Reset password for existing user + ansible.builtin.shell: | + sudo -u postgres psql -c "ALTER USER \"{{ wire_user }}\" WITH PASSWORD '{{ wire_pass }}';" + when: user_check_result.rc == 0 + + - name: Grant database privileges to the user + ansible.builtin.shell: | + sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE \"{{ wire_dbname }}\" TO \"{{ wire_user }}\";" - name: Grant CREATE on public schema to the user - community.postgresql.postgresql_query: - db: "{{ wire_dbname }}" - query: "GRANT CREATE ON SCHEMA public TO \"{{ wire_user }}\";" - become: yes - become_user: postgres - when: user_creation_result.changed - - - name: Display PostgreSQL user credentials if creation was successful - debug: - msg: "PostgreSQL user '{{ wire_user }}' created successfully. 
Password: {{ wire_pass }}" - when: user_creation_result is defined and user_creation_result.changed + ansible.builtin.shell: | + sudo -u postgres psql -d "{{ wire_dbname }}" -c "GRANT CREATE ON SCHEMA public TO \"{{ wire_user }}\";" + + - name: Grant USAGE on public schema to the user + ansible.builtin.shell: | + sudo -u postgres psql -d "{{ wire_dbname }}" -c "GRANT USAGE ON SCHEMA public TO \"{{ wire_user }}\";" + + - name: Set default privileges for future tables + ansible.builtin.shell: | + sudo -u postgres psql -d "{{ wire_dbname }}" -c "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO \"{{ wire_user }}\";" + + - name: Set default privileges for future sequences + ansible.builtin.shell: | + sudo -u postgres psql -d "{{ wire_dbname }}" -c "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO \"{{ wire_user }}\";" + + - name: Display PostgreSQL setup completion + ansible.builtin.debug: + msg: "{{ setup_completion_msg.split('\n') }}" + vars: + setup_completion_msg: | + PostgreSQL setup completed: + Database: {{ wire_dbname }} + User: {{ wire_user }} + Status: {% if user_check_result.rc != 0 %}User created{% else %}Password updated for existing user{% endif %} + + Kubernetes Secret: {{ pg_secret_name }} + Namespace: {{ wire_namespace | default('default') }} + Contains: password, username, database, host, port + {% if password_generated is defined %} + Password: Randomly generated 32-character strong password + Stored securely in Kubernetes Secret (no hardcoded passwords) + Access it via: kubectl get secret {{ pg_secret_name }} -n {{ wire_namespace | default('default') }} -o jsonpath='{.data.password}' | base64 --decode + {% elif password_source is defined %} + Password: Retrieved from existing Kubernetes Secret + Access it via: kubectl get secret {{ pg_secret_name }} -n {{ wire_namespace | default('default') }} -o jsonpath='{.data.password}' | base64 --decode + {% endif %} + + - name: Verify database and user setup + 
ansible.builtin.shell: | + PGPASSWORD='{{ wire_pass }}' psql -h {{ ansible_default_ipv4.address }} -U "{{ wire_user }}" -d "{{ wire_dbname }}" -c "SELECT 1" + register: verification_result + failed_when: false + + - name: Display verification status + ansible.builtin.debug: + msg: "✅ Database connection verified successfully" + when: verification_result.rc == 0 + + - name: Display verification failure + ansible.builtin.debug: + msg: "❌ Database connection failed: {{ verification_result.stderr }}" + when: verification_result.rc != 0 diff --git a/ansible/templates/pg_hba.conf.j2 b/ansible/templates/pg_hba.conf.j2 deleted file mode 100644 index 29a469117..000000000 --- a/ansible/templates/pg_hba.conf.j2 +++ /dev/null @@ -1,27 +0,0 @@ -# PostgreSQL Client Authentication Configuration File -# {{ ansible_managed }} - -# TYPE DATABASE USER ADDRESS METHOD - -# "local" is for Unix domain socket connections only -local all all peer - -# IPv4 local connections: -host all all 127.0.0.1/32 md5 - -# IPv6 local connections: -host all all ::1/128 md5 - -{% if inventory_hostname in groups['postgresql_rw'] %} -{# Allow replication connections from replica nodes #} -host replication {{ repsvc_user }} {{ replica_node1 }}/32 md5 -host replication {{ repsvc_user }} {{ replica_node2 }}/32 md5 -{% endif %} - -{% if inventory_hostname in groups['postgresql_rw'] or inventory_hostname in groups['postgresql_ro'] %} -{# Allow connections from the primary node network #} -host all all {{ primary_node }}/24 md5 -{% endif %} - -# Allow connections from application servers (adjust as needed) -# host all all 10.0.0.0/8 md5 diff --git a/ansible/templates/postgresql/detect-rogue-primary.service.j2 b/ansible/templates/postgresql/detect-rogue-primary.service.j2 new file mode 100644 index 000000000..966713908 --- /dev/null +++ b/ansible/templates/postgresql/detect-rogue-primary.service.j2 @@ -0,0 +1,28 @@ +# detect-rogue-primary.service.j2 +[Unit] +Description=PostgreSQL Split-Brain Detection Service 
+Documentation=man:systemd.service(5) +After=postgresql@{{ postgresql_version }}-main.service +Wants=postgresql@{{ postgresql_version }}-main.service + +[Service] +Type=oneshot +User=postgres +Group=postgres +WorkingDirectory=/var/lib/postgresql +ExecStart=/usr/local/bin/detect_rogue_primary.sh +StandardOutput=journal +StandardError=journal +TimeoutSec=60 +Environment=PGUSER=postgres +Environment=PGDATABASE={{ repmgr_database }} +Environment=PGCONNECT_TIMEOUT=5 + +# Only run if PostgreSQL is running +ExecCondition=/bin/systemctl is-active postgresql@{{ postgresql_version }}-main.service + +# Don't restart on failure - let timer handle next run +Restart=no + +[Install] +WantedBy=multi-user.target diff --git a/ansible/templates/postgresql/detect-rogue-primary.timer.j2 b/ansible/templates/postgresql/detect-rogue-primary.timer.j2 new file mode 100644 index 000000000..cf02a19d6 --- /dev/null +++ b/ansible/templates/postgresql/detect-rogue-primary.timer.j2 @@ -0,0 +1,24 @@ +# detect-rogue-primary.timer.j2 +[Unit] +Description=PostgreSQL Split-Brain Detection Timer +Documentation=man:systemd.timer(5) +Requires=detect-rogue-primary.service +After=postgresql@{{ postgresql_version }}-main.service +Wants=postgresql@{{ postgresql_version }}-main.service + +[Timer] +# Run every 30 seconds when PostgreSQL is active +OnCalendar=*:*:0/30 +AccuracySec=5s + +# Wait for PostgreSQL to be stable after boot +OnBootSec=120s + +# Don't catch up on missed runs if system was down +Persistent=false + +# Randomize execution to avoid simultaneous checks across nodes +RandomizedDelaySec=10s + +[Install] +WantedBy=timers.target \ No newline at end of file diff --git a/ansible/templates/postgresql/detect_rogue_primary.sh.j2 b/ansible/templates/postgresql/detect_rogue_primary.sh.j2 new file mode 100644 index 000000000..bae6e1e91 --- /dev/null +++ b/ansible/templates/postgresql/detect_rogue_primary.sh.j2 @@ -0,0 +1,102 @@ +#!/bin/bash +# detect_rogue_primary.sh - Split-brain monitoring tool +set 
-euo pipefail + +HOSTNAME="$(hostname)" + +# Cluster nodes from Ansible inventory (excluding myself) +CLUSTER_NODES=( +{%- set all_nodes = (groups.postgresql_rw | default([])) + (groups.postgresql_ro | default([])) %} +{%- for node in all_nodes %} +{%- if node != inventory_hostname %} + "{{ hostvars[node].ansible_default_ipv4.address | default(hostvars[node].ansible_host | default(node)) }}" +{%- endif %} +{%- endfor %} +) + +# Database connection settings +DB_USER="{{ repmgr_user }}" +DB_PASSWORD="{{ repmgr_password }}" +DB_NAME="{{ repmgr_database | default('postgres') }}" + +# Check if I'm primary with no replicas +IS_PRIMARY="$(psql -t -A -q -d "$DB_NAME" -c "SELECT NOT pg_is_in_recovery();" | tr -d '[:space:]')" +REPLICA_COUNT="$(psql -t -A -q -d "$DB_NAME" -c "SELECT COUNT(*) FROM pg_stat_replication;" | tr -d '[:space:]')" + +echo "[$HOSTNAME] I am primary: $IS_PRIMARY, Replica count: $REPLICA_COUNT" + +# Silent exit if not an isolated primary +if [[ "$IS_PRIMARY" != "t" || "$REPLICA_COUNT" != "0" ]]; then + echo "[$HOSTNAME] Not an isolated primary - no split-brain check needed" + exit 0 +fi + +echo "[$HOSTNAME] I'm an isolated primary - checking other cluster nodes..." +SPLIT_BRAIN_DETECTED=false + +# Check each cluster node +for NODE_IP in "${CLUSTER_NODES[@]}"; do + [[ -z "$NODE_IP" ]] && continue + + echo "[$HOSTNAME] Checking node $NODE_IP for primary status..." + + # Check if remote node is also primary + REMOTE_PRIMARY="$(PGPASSWORD="$DB_PASSWORD" psql -h "$NODE_IP" -U "$DB_USER" -d "$DB_NAME" \ + -t -A -q -c "SELECT NOT pg_is_in_recovery();" 2>/dev/null | tr -d '[:space:]')" || REMOTE_PRIMARY="" + + if [[ "$REMOTE_PRIMARY" == "t" ]]; then + echo "🚨 [$HOSTNAME] SPLIT-BRAIN DETECTED: Node $NODE_IP is also PRIMARY!" 
+ SPLIT_BRAIN_DETECTED=true + break + elif [[ "$REMOTE_PRIMARY" == "f" ]]; then + echo "[$HOSTNAME] Node $NODE_IP is replica (good)" + else + echo "[$HOSTNAME] Node $NODE_IP is unreachable or returned no status" + fi +done + +if [[ "$SPLIT_BRAIN_DETECTED" == "true" ]]; then + echo "[$HOSTNAME] 🛑 STOPPING POSTGRESQL TO RESOLVE SPLIT-BRAIN" + logger "Split-brain detected on $HOSTNAME - stopping PostgreSQL service" + + # Mask to prevent restart attempts + echo "[$HOSTNAME] Masking PostgreSQL service to prevent restart..." + sudo systemctl mask postgresql@{{ postgresql_version }}-main.service || { + echo "[$HOSTNAME] Warning: Failed to mask PostgreSQL service" + } + + # Then stop PostgreSQL service + echo "[$HOSTNAME] Stopping PostgreSQL service..." + sudo systemctl stop postgresql@{{ postgresql_version }}-main.service || { + echo "[$HOSTNAME] Warning: Failed to stop PostgreSQL service normally" + } + + # Verify the stop actually worked + echo "[$HOSTNAME] Verifying PostgreSQL has stopped..." + sleep 2 + + if systemctl is-active --quiet postgresql@{{ postgresql_version }}-main.service; then + echo "[$HOSTNAME] ⚠️ PostgreSQL still active, attempting force stop..." 
+ sudo systemctl kill postgresql@{{ postgresql_version }}-main.service || { + echo "[$HOSTNAME] Warning: Force kill command failed" + } + sleep 2 + + if systemctl is-active --quiet postgresql@{{ postgresql_version }}-main.service; then + echo "[$HOSTNAME] ❌ Failed to stop PostgreSQL - manual intervention required" + logger "CRITICAL: Failed to stop PostgreSQL during split-brain resolution on $HOSTNAME" + else + echo "[$HOSTNAME] ✅ PostgreSQL successfully stopped after force kill" + fi + else + echo "[$HOSTNAME] ✅ PostgreSQL successfully stopped" + fi + + echo "[$HOSTNAME] PostgreSQL stopped and masked - manual intervention required" + exit 0 +else + echo "[$HOSTNAME] ✅ No split-brain detected - I'm the only primary in reachable nodes" +fi + +echo "[$HOSTNAME] Split-brain monitoring check completed" +exit 0 \ No newline at end of file diff --git a/ansible/templates/postgresql/failover_validation.sh.j2 b/ansible/templates/postgresql/failover_validation.sh.j2 new file mode 100644 index 000000000..1963409cc --- /dev/null +++ b/ansible/templates/postgresql/failover_validation.sh.j2 @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# failover_validation.sh — repmgr failover gate +# usage: failover_validation.sh <node_id> <visible_nodes> <total_nodes> +# exit 0 => allow promotion; non-zero => veto + +set -u +set -o pipefail + +NODE_ID="${1:-}"; VISIBLE="${2:-0}"; TOTAL="${3:-0}" +PSQL=${PSQL:-/usr/bin/psql} +DBNAME=${DBNAME:-postgres} + +# Log to syslog; never fail the script if logging fails +log(){ logger -t failure_validation -- "$*" || true; } + +# 1) Minimal quorum: for 3+ nodes, require ≥2 visible
if [[ "$TOTAL" -ge 3 && "$VISIBLE" -lt 2 ]]; then + log "Reject: insufficient visible nodes (visible=$VISIBLE,total=$TOTAL)" + echo "Reject: insufficient visible nodes" + exit 1 +fi + +# 2) Must still be a standby (only promote from recovery) +if ! 
"$PSQL" -X -Atqc "select pg_is_in_recovery();" -d "$DBNAME" | grep -qx 't'; then + echo "Reject: not in recovery (already primary?)" + exit 1 +fi + +# 3) Advisory checks (do NOT veto) +if ! "$PSQL" -X -Atqc "select 1 from pg_stat_wal_receiver limit 1;" -d "$DBNAME" >/dev/null 2>&1; then + log "Warn: WAL receiver not active" +fi + +LAG_CAP=${LAG_CAP:-67108864} # 64MB default +DELAY=$("$PSQL" -X -Atqc "select coalesce(pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()),0);" -d "$DBNAME" 2>/dev/null || echo 0) +if [[ "$DELAY" =~ ^[0-9]+$ ]] && (( DELAY > LAG_CAP )); then + log "Warn: replay delay ${DELAY} > ${LAG_CAP}" +fi + +echo "OK: promote node ${NODE_ID}" +exit 0 \ No newline at end of file diff --git a/ansible/templates/postgresql/pg_hba.conf.j2 b/ansible/templates/postgresql/pg_hba.conf.j2 new file mode 100644 index 000000000..112131dd0 --- /dev/null +++ b/ansible/templates/postgresql/pg_hba.conf.j2 @@ -0,0 +1,24 @@ +# PostgreSQL Client Authentication Configuration File +# {{ ansible_managed }} + +# TYPE DATABASE USER ADDRESS METHOD + +# PostgreSQL HBA configuration +# Local connections +local all postgres peer +local all all peer + +{% set network_subnet = primary_node | ipaddr('network') + '/24' %} +# repmgr metadata connections +local {{ repmgr_database }} {{ repmgr_user }} md5 +host {{ repmgr_database }} {{ repmgr_user }} 127.0.0.1/32 md5 +host {{ repmgr_database }} {{ repmgr_user }} {{ network_subnet | default('10.0.0.0/8') }} md5 + +# repmgr streaming replication connections +local replication {{ repmgr_user }} md5 +host replication {{ repmgr_user }} 127.0.0.1/32 md5 +host replication {{ repmgr_user }} {{ network_subnet | default('10.0.0.0/8') }} md5 + + +# Application access for the network +host all all {{ network_subnet | default('10.0.0.0/8') }} md5 \ No newline at end of file diff --git a/ansible/templates/postgresql/pgpass.j2 b/ansible/templates/postgresql/pgpass.j2 new file mode 100644 index 000000000..be3c87d4e --- /dev/null +++ 
b/ansible/templates/postgresql/pgpass.j2 @@ -0,0 +1,4 @@ +{% for host in groups['postgresql_rw'] + groups['postgresql_ro'] %} +{{ hostvars[host]['ansible_default_ipv4']['address'] | default(hostvars[host]['ansible_host']) }}:5432:{{ repmgr_database }}:{{ repmgr_user }}:{{ repmgr_password }} +{{ hostvars[host]['ansible_default_ipv4']['address'] | default(hostvars[host]['ansible_host']) }}:5432:replication:{{ repmgr_user }}:{{ repmgr_password }} +{% endfor %} \ No newline at end of file diff --git a/ansible/templates/postgresql/postgresql.conf.j2 b/ansible/templates/postgresql/postgresql.conf.j2 new file mode 100644 index 000000000..3f122c03d --- /dev/null +++ b/ansible/templates/postgresql/postgresql.conf.j2 @@ -0,0 +1,186 @@ +# postgresql.conf.j2 - Unified configuration for 3-node PostgreSQL cluster with repmgr +# {{ ansible_managed }} +# Hardware: 1GB RAM, 1 Core, 50GB Disk +# Cluster: 1 Primary + 2 Standby nodes (no witness) + +# ==================================================================== +# FILE LOCATIONS +# Ref: https://www.postgresql.org/docs/17/runtime-config-file-locations.html +# ==================================================================== +data_directory = '/var/lib/postgresql/{{ postgresql_version }}/main' +hba_file = '/etc/postgresql/{{ postgresql_version }}/main/pg_hba.conf' +ident_file = '/etc/postgresql/{{ postgresql_version }}/main/pg_ident.conf' +external_pid_file = '/var/run/postgresql/{{ postgresql_version }}-main.pid' + +# ==================================================================== +# CONNECTIONS AND AUTHENTICATION +# Ref: https://www.postgresql.org/docs/17/runtime-config-connection.html +# ==================================================================== +listen_addresses = '*' # Accept connections from any IP +port = 5432 +max_connections = 20 # Conservative for 1GB RAM +superuser_reserved_connections = 3 # Reserve for maintenance/repmgr + +# REQUIRED by repmgr - must include 'repmgr' +# Ref: 
https://www.repmgr.org/docs/current/configuration-prerequisites.html +shared_preload_libraries = 'pg_stat_statements,repmgr' + +# ==================================================================== +# RESOURCE USAGE (1GB RAM, 1 Core) +# Ref: https://www.postgresql.org/docs/17/runtime-config-resource.html +# ==================================================================== +shared_buffers = 256MB # 25% of RAM (PG recommendation) +effective_cache_size = 512MB # 50% of RAM (OS cache estimate) +work_mem = 2MB # RAM/connections/complexity_factor +maintenance_work_mem = 64MB # For VACUUM, CREATE INDEX +wal_buffers = -1 # Auto-tune: 3% of shared_buffers + +# Worker processes - limited by 1 core +# Ref: https://www.postgresql.org/docs/17/runtime-config-resource.html#GUC-MAX-WORKER-PROCESSES +max_worker_processes = 2 # Min for repmgr operations +max_parallel_workers = 1 # Limited by single core +max_parallel_workers_per_gather = 0 # Disable parallel queries (1 core) + +# ==================================================================== +# WRITE-AHEAD LOG +# Ref: https://www.postgresql.org/docs/17/runtime-config-wal.html +# ==================================================================== + +# REQUIRED by repmgr +# Ref: https://www.repmgr.org/docs/current/configuration-prerequisites.html +wal_level = replica # Required: minimum 'replica' for replication +wal_log_hints = on # Required: enables pg_rewind for failback + +# Replication slots and senders +# Ref: https://www.repmgr.org/docs/current/configuration-prerequisites.html#CONFIGURATION +max_wal_senders = 10 # Required: min 2, repmgr recommends 10 +max_replication_slots = 10 # Required: for replication slot usage +wal_keep_size = 2GB # 4% of 50GB disk (PG 13+) +max_slot_wal_keep_size = 3GB # Safety limit: 6% of disk + +# WAL writing performance +# Ref: https://www.postgresql.org/docs/17/wal-configuration.html +wal_sync_method = fdatasync # Best for Linux ext4/xfs +wal_writer_delay = 200ms # Frequency of WAL 
flushes +wal_writer_flush_after = 1MB +wal_compression = on # Save disk space (important for 50GB) +commit_delay = 0 # No artificial delay +commit_siblings = 5 + +# Archiving - recommended by repmgr for PITR capability +# Ref: https://www.repmgr.org/docs/current/configuration-prerequisites.html +archive_mode = on # Enable without restart requirement +archive_command = '/bin/true' # Placeholder for air-gapped environment + +# ==================================================================== +# REPLICATION +# Ref: https://www.postgresql.org/docs/17/runtime-config-replication.html +# ==================================================================== + +# Synchronous replication - critical for 3-node cluster without witness +# Ref: https://www.postgresql.org/docs/17/warm-standby.html#SYNCHRONOUS-REPLICATION +synchronous_standby_names = '' +synchronous_commit = on # Wait for standby confirmation + +# Primary server +# Ref: https://www.postgresql.org/docs/17/runtime-config-replication.html#GUC-WAL-SENDER-TIMEOUT +wal_sender_timeout = 30s # Detect dead standby connections faster + +# Standby servers +# Ref: https://www.postgresql.org/docs/17/hot-standby.html +hot_standby = on # REQUIRED by repmgr: allow queries on standby +hot_standby_feedback = on # Prevent query conflicts +wal_receiver_status_interval = 10s # Status update frequency +wal_receiver_timeout = 30s # Reconnect if no data received +max_standby_streaming_delay = 30s # Max lag before canceling queries +max_standby_archive_delay = 30s + +# ==================================================================== +# CHECKPOINTS +# Ref: https://www.postgresql.org/docs/17/wal-configuration.html +# ==================================================================== +checkpoint_completion_target = 0.9 # Spread checkpoint I/O +checkpoint_timeout = 15min # Maximum time between checkpoints +max_wal_size = 1GB # 2% of disk - trigger checkpoint +min_wal_size = 256MB # Minimum WAL retained +checkpoint_flush_after = 256kB 
# Force OS writeback + +# ==================================================================== +# QUERY PLANNER +# Ref: https://www.postgresql.org/docs/17/runtime-config-query.html +# ==================================================================== +random_page_cost = 1.1 # For SSD (default 4.0 for HDD) +effective_io_concurrency = 1 # Single disk +maintenance_io_concurrency = 1 + +# ==================================================================== +# BACKGROUND WRITER +# Ref: https://www.postgresql.org/docs/17/runtime-config-resource.html#RUNTIME-CONFIG-RESOURCE-BACKGROUND-WRITER +# ==================================================================== +bgwriter_delay = 200ms +bgwriter_lru_maxpages = 100 +bgwriter_lru_multiplier = 2.0 +bgwriter_flush_after = 512kB + +# ==================================================================== +# LOGGING +# Ref: https://www.postgresql.org/docs/17/runtime-config-logging.html +# ==================================================================== +logging_collector = on +log_directory = 'log' +log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' +log_rotation_age = 1d +log_rotation_size = 100MB + +# What to log +log_min_duration_statement = 1000 # Log slow queries (>1s) +log_line_prefix = '%t [%p]: user=%u,db=%d,app=%a,client=%h ' +log_statement = 'ddl' # Log schema changes +log_replication_commands = on # Monitor replication +log_checkpoints = on # Monitor checkpoint performance +log_connections = on # Security auditing +log_disconnections = on +log_lock_waits = on # Performance issues +log_recovery_conflict_waits = on # Standby conflicts +log_temp_files = 0 # All temp file usage +log_autovacuum_min_duration = 0 # All autovacuum activity + +# ==================================================================== +# STATISTICS +# Ref: https://www.postgresql.org/docs/17/runtime-config-statistics.html +# ==================================================================== +track_activities = on +track_counts = on 
+track_io_timing = off # Disabled to reduce overhead +track_functions = none + +# ==================================================================== +# AUTOVACUUM +# Ref: https://www.postgresql.org/docs/17/runtime-config-autovacuum.html +# ==================================================================== +autovacuum = on +autovacuum_max_workers = 2 # Balance between maintenance and resources +autovacuum_naptime = 60s # Check interval +autovacuum_vacuum_threshold = 50 # Min changes before vacuum +autovacuum_vacuum_scale_factor = 0.2 # 20% of table size +autovacuum_analyze_threshold = 50 # Min changes before analyze +autovacuum_analyze_scale_factor = 0.1 # 10% of table size +autovacuum_work_mem = -1 # Use maintenance_work_mem + +# ==================================================================== +# CLIENT CONNECTION DEFAULTS +# Ref: https://www.postgresql.org/docs/17/runtime-config-client.html +# ==================================================================== +idle_in_transaction_session_timeout = 600s # Kill idle transactions after 10min + +# ==================================================================== +# LOCALE AND FORMATTING +# Ref: https://www.postgresql.org/docs/17/runtime-config-preset.html +# ==================================================================== +datestyle = 'iso, mdy' +timezone = 'UTC' +lc_messages = 'en_US.UTF-8' +lc_monetary = 'en_US.UTF-8' +lc_numeric = 'en_US.UTF-8' +lc_time = 'en_US.UTF-8' +default_text_search_config = 'pg_catalog.english' \ No newline at end of file diff --git a/ansible/templates/postgresql/repmgr.conf.j2 b/ansible/templates/postgresql/repmgr.conf.j2 new file mode 100644 index 000000000..bad9b5a17 --- /dev/null +++ b/ansible/templates/postgresql/repmgr.conf.j2 @@ -0,0 +1,87 @@ +# repmgr.conf.j2 - with documentation references +# {{ ansible_managed }} + +# ==================================================================== +# NODE IDENTIFICATION +# Ref: 
https://www.repmgr.org/docs/current/configuration-file.html +# ==================================================================== +{% set node_config = repmgr_node_config[inventory_hostname] | default({}) %} +node_id={{ node_config.node_id | default(1) }} +node_name='{{ inventory_hostname }}' +{% if node_config.priority is defined %} +priority={{ node_config.priority }} +{% endif %} + +# ==================================================================== +# CONNECTION SETTINGS +# Ref: https://www.repmgr.org/docs/current/configuration-file.html +# ==================================================================== +conninfo='host={{ ansible_default_ipv4.address | default(ansible_host) }} user={{ repmgr_user }} dbname={{ repmgr_database }} password={{ repmgr_password }} connect_timeout=2' + +# ==================================================================== +# POSTGRESQL PATHS +# ==================================================================== +data_directory='{{ postgresql_data_dir }}' +config_directory='{{ postgresql_conf_dir }}' +pg_bindir='/usr/lib/postgresql/{{ postgresql_version }}/bin' +passfile='/var/lib/postgresql/.pgpass' + +# ==================================================================== +# REPLICATION +# ==================================================================== +use_replication_slots=yes +monitoring_history=true + +# ==================================================================== +# AUTOMATIC FAILOVER +# Ref: https://www.repmgr.org/docs/current/repmgrd-basic-configuration.html +# ==================================================================== +failover=automatic +primary_visibility_consensus=true +failover_validation_command='/opt/repmgr/scripts/failover_validation.sh %n %v %t' +repmgrd_exit_on_inactive_node=true + +# Promotion and follow commands +# Ref: https://github.com/EnterpriseDB/repmgr/blob/master/repmgr.conf.sample +promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr/{{ postgresql_version 
}}-main/repmgr.conf --log-to-file' +follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr/{{ postgresql_version }}-main/repmgr.conf --upstream-node-id=%n --log-to-file' + +# ==================================================================== +# SERVICE MANAGEMENT COMMANDS +# Ref: https://www.repmgr.org/docs/current/configuration-file-service-commands.html +# ==================================================================== + +service_start_command='sudo systemctl start postgresql@{{ postgresql_version }}-main' +service_stop_command='sudo systemctl stop postgresql@{{ postgresql_version }}-main' +service_restart_command='sudo systemctl restart postgresql@{{ postgresql_version }}-main' +service_reload_command='sudo systemctl reload postgresql@{{ postgresql_version }}-main' + +# ==================================================================== +# EVENT NOTIFICATION +# ==================================================================== +event_notification_command='/opt/repmgr/scripts/simple_fence.sh %n %e %s' + +# ==================================================================== +# MONITORING +# Ref: https://www.repmgr.org/docs/current/repmgrd-monitoring.html +# ==================================================================== +monitor_interval_secs={{ monitor_interval_secs | default(2) }} +reconnect_attempts={{ reconnect_attempts | default(6) }} +reconnect_interval={{ reconnect_interval | default(5) }} +standby_disconnect_on_failover=true + +# ==================================================================== +# REPMGRD SERVICE MANAGEMENT +# Ref: https://github.com/EnterpriseDB/repmgr/blob/master/repmgr.conf.sample +# ==================================================================== +repmgrd_service_start_command='sudo systemctl start repmgrd@{{ postgresql_version }}-main' +repmgrd_service_stop_command='sudo systemctl stop repmgrd@{{ postgresql_version }}-main' +repmgrd_pid_file='/tmp/repmgrd-{{ postgresql_version }}-main.pid' + +# 
==================================================================== +# LOGGING (OPTIONAL BUT RECOMMENDED) +# ==================================================================== +log_level='INFO' +log_facility='LOCAL1' +log_file='/var/log/postgresql/repmgr-{{ postgresql_version }}-main.log' +log_status_interval=300 \ No newline at end of file diff --git a/ansible/templates/postgresql/repmgrd_service.j2 b/ansible/templates/postgresql/repmgrd_service.j2 new file mode 100644 index 000000000..c8c00677e --- /dev/null +++ b/ansible/templates/postgresql/repmgrd_service.j2 @@ -0,0 +1,15 @@ +[Unit] +Description=Repmgr failover daemon (instance %i) +After=network.target postgresql@%i.service +Wants=postgresql@%i.service + +[Service] +Type=forking +User=postgres +ExecStart=/usr/bin/repmgrd -f /etc/repmgr/%i/repmgr.conf --daemonize +PIDFile=/tmp/repmgrd-%i.pid +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/ansible/templates/postgresql/simple_fence.sh.j2 b/ansible/templates/postgresql/simple_fence.sh.j2 new file mode 100644 index 000000000..effc2a1f1 --- /dev/null +++ b/ansible/templates/postgresql/simple_fence.sh.j2 @@ -0,0 +1,85 @@ +#!/bin/bash +# simple_fence.sh — basic event handler for repmgr + +set -euo pipefail + +# --- CONFIG (templated) --- +PGUSER="{{ repmgr_user }}" +PGDATABASE="{{ repmgr_database }}" +LOGFILE="/var/log/postgresql/fence_events.log" +SCRIPT_NAME="simple_fence" +LOCAL_NODE_ID="{{ repmgr_node_config[inventory_hostname].node_id if repmgr_node_config is defined and repmgr_node_config.get(inventory_hostname) and repmgr_node_config[inventory_hostname].get('node_id') else '1' }}" + +# Node mappings (id → ip/name), generated from inventory if available +declare -A NODE_HOSTS=({% set nodes = ((groups.postgresql_rw|default([])) + (groups.postgresql_ro|default([]))) -%} +{%- for h in nodes -%} +[{{ (repmgr_node_config[h].node_id if repmgr_node_config is defined and repmgr_node_config.get(h) 
and repmgr_node_config[h].get('node_id') else loop.index) }}]="{{ hostvars[h].ansible_default_ipv4.address | default(hostvars[h].ansible_host | default(h)) }}"{% if not loop.last %} {% endif %} +{%- endfor -%}) +declare -A NODE_NAMES=({% for h in nodes -%} +[{{ (repmgr_node_config[h].node_id if repmgr_node_config is defined and repmgr_node_config.get(h) and repmgr_node_config[h].get('node_id') else loop.index) }}]="{{ h }}"{% if not loop.last %} {% endif %} +{%- endfor -%}) + +# --- Logging --- +mkdir -p "$(dirname "$LOGFILE")" +log_event(){ printf '%s [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$SCRIPT_NAME" "$1" >>"$LOGFILE" || true; } + +# --- DB helpers --- +psql_q(){ psql -X -qAt -v ON_ERROR_STOP=1 -U "$PGUSER" -d "$PGDATABASE" "$@" 2>/dev/null; } +update_node_status(){ + local id="$1" status="$2" reason="$3" name="${NODE_NAMES[$id]}" + local q="update repmgr.nodes set active=${status} where node_id=${id};" + log_event "Set active=${status} for ${name} (ID:${id}) — ${reason}" + for h in "${NODE_HOSTS[@]}"; do + if psql_q -h "$h" -c "$q" >/dev/null; then log_event "Metadata updated via $h"; return 0; fi + done + log_event "ERROR: failed to update metadata on any host"; return 1 +} + +# --- Event args --- +NODE_ID="${1:-}"; EVENT="${2:-}"; SUCCESS="${3:-}" +[ -n "$NODE_ID" ] || exit 0 +[ -v NODE_NAMES["$NODE_ID"] ] || { log_event "ERROR: unknown node id $NODE_ID"; exit 1; } + +log_event "=== EVENT === id=$NODE_ID name=${NODE_NAMES[$NODE_ID]} event=$EVENT success=$SUCCESS" + +# --- Basic Event Handlers --- +handle_failover_promote(){ + log_event "Failover promote on ${NODE_NAMES[$NODE_ID]}" + [ "$SUCCESS" = "1" ] || { log_event "Promotion not successful; no action taken"; return; } + log_event "Promotion successful - cluster state updated by repmgr" +} + +handle_standby_promote(){ + if [ "$SUCCESS" = "1" ]; then + log_event "Standby promotion successful on node $NODE_ID" + handle_failover_promote + else + log_event "Standby promotion failed" + fi +} + 
+handle_node_rejoin(){ + if [ "$SUCCESS" = "1" ]; then + # Check if this node's PostgreSQL service is masked + if systemctl is-enabled postgresql@{{ postgresql_version }}-main.service | grep -q "masked"; then + log_event "PostgreSQL service is masked, unmasking for rejoin..." + sudo systemctl unmask postgresql@{{ postgresql_version }}-main.service + log_event "PostgreSQL service unmasked successfully" + fi + + update_node_status "$NODE_ID" "true" "Rejoined cluster successfully" + log_event "Node ${NODE_NAMES[$NODE_ID]} successfully rejoined" + else + log_event "Node rejoin failed" + fi +} + +case "$EVENT" in + standby_promote) handle_standby_promote ;; + failover) handle_failover_promote ;; + node_rejoin) handle_node_rejoin ;; + *) log_event "Event $EVENT logged (no specific action)" ;; +esac + +log_event "Event handled" +exit 0 \ No newline at end of file diff --git a/ansible/templates/postgresql_primary.conf.j2 b/ansible/templates/postgresql_primary.conf.j2 deleted file mode 100644 index aa6f6cddb..000000000 --- a/ansible/templates/postgresql_primary.conf.j2 +++ /dev/null @@ -1,120 +0,0 @@ -# PostgreSQL Configuration for Primary Node (Streaming Replication Optimized) -# {{ ansible_managed }} - -# Basic Settings -# https://www.postgresql.org/docs/17/runtime-config-file-locations.html -data_directory = '/var/lib/postgresql/{{ postgresql_version }}/main' -hba_file = '/etc/postgresql/{{ postgresql_version }}/main/pg_hba.conf' -ident_file = '/etc/postgresql/{{ postgresql_version }}/main/pg_ident.conf' -external_pid_file = '/var/run/postgresql/{{ postgresql_version }}-main.pid' - -# Connection Settings -# https://www.postgresql.org/docs/17/runtime-config-connection.html -listen_addresses = '*' -port = 5432 -max_connections = 20 -superuser_reserved_connections = 2 -shared_preload_libraries = 'pg_stat_statements' - -# Memory Settings (optimized for 1GB RAM, 1 core) -# https://www.postgresql.org/docs/17/runtime-config-resource.html -shared_buffers = 128MB # ~12.5% of 
total RAM (conservative for limited memory) -effective_cache_size = 512MB # ~50% of total RAM for OS cache -work_mem = 2MB # Limited for constrained memory -maintenance_work_mem = 32MB # Conservative for maintenance operations -wal_buffers = 4MB # Smaller WAL buffer -max_worker_processes = 1 # Match core count -max_parallel_workers = 1 # Match single core -max_parallel_workers_per_gather = 0 # Disable parallel workers for single core - -# Write-Ahead Logging (WAL) - Optimized for 50GB disk constraint -# https://www.postgresql.org/docs/17/runtime-config-wal.html -wal_level = replica -max_wal_senders = 4 # Limited for resource constraints -max_replication_slots = 4 # Conservative number of slots -wal_keep_size = 2GB # 4% of disk space for WAL retention -wal_sender_timeout = 60s -max_slot_wal_keep_size = 3GB # 6% of disk space maximum - -# WAL Writing and Flushing (for minimal latency) -wal_sync_method = fdatasync -wal_writer_delay = 200ms # More frequent WAL writes -wal_writer_flush_after = 1MB -commit_delay = 0 # No artificial delay -commit_siblings = 5 - -# Streaming Replication Settings - Synchronous for resource efficiency -# https://www.postgresql.org/docs/17/runtime-config-replication.html -synchronous_standby_names = '' # Async replication to reduce resource usage -synchronous_commit = on # Default to synchronous commit -wal_receiver_status_interval = 10s # Less frequent updates to save resources -max_standby_streaming_delay = 120s # Longer delays acceptable for resource constraints -max_standby_archive_delay = 120s -hot_standby_feedback = on # Prevent query conflicts on replicas - -# Checkpoints (optimized for limited disk I/O) -# https://www.postgresql.org/docs/17/runtime-config-wal.html#RUNTIME-CONFIG-WAL-CHECKPOINTS -checkpoint_completion_target = 0.9 # Slower completion for limited I/O -checkpoint_timeout = 15min # Longer intervals to reduce I/O load -max_wal_size = 512MB # 1% of disk space before checkpoint -min_wal_size = 128MB # Reasonable minimum 
-checkpoint_flush_after = 64kB # Smaller flushes for limited I/O - -# Background Writer -# https://www.postgresql.org/docs/17/runtime-config-resource.html#RUNTIME-CONFIG-RESOURCE-BACKGROUND-WRITER -bgwriter_delay = 200ms # More frequent background writes -bgwriter_lru_maxpages = 100 -bgwriter_lru_multiplier = 2.0 -bgwriter_flush_after = 256kB - -# Query Planner -# https://www.postgresql.org/docs/17/runtime-config-query.html#RUNTIME-CONFIG-QUERY-CONSTANTS -random_page_cost = 1.5 -effective_io_concurrency = 1 -maintenance_io_concurrency = 1 - -# Logging (focused on replication and queries) -# https://www.postgresql.org/docs/17/runtime-config-logging.html -log_destination = 'stderr' -logging_collector = on -log_directory = 'log' -log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' -log_rotation_age = 1d -log_rotation_size = 50MB -log_min_duration_statement = 2000 # Log slower queries on replica -log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,client=%h,app=%a ' -log_statement = 'none' # Less logging on replica -log_replication_commands = on # Monitor replication -log_checkpoints = on -log_connections = off -log_disconnections = off -log_lock_waits = on -log_recovery_conflict_waits = on # Log recovery conflicts - -# Statistics -# https://www.postgresql.org/docs/17/runtime-config-statistics.html#RUNTIME-CONFIG-CUMULATIVE-STATISTICS -track_activities = on -track_counts = on -track_io_timing = off # Monitor I/O performance -track_functions = none - -# Autovacuum (tuned for resource constraints) -# https://www.postgresql.org/docs/17/routine-vacuuming.html -autovacuum = on -autovacuum_max_workers = 1 # Single worker for single core -autovacuum_naptime = 120s # Less frequent for resource conservation -autovacuum_vacuum_threshold = 100 # Higher thresholds -autovacuum_vacuum_scale_factor = 0.2 # Less aggressive -autovacuum_analyze_threshold = 100 -autovacuum_analyze_scale_factor = 0.15 -autovacuum_work_mem = 16MB # Reduced memory for autovacuum - -# Locale and Formatting -# 
https://www.postgresql.org/docs/17/locale.html -datestyle = 'iso, mdy' -timezone = 'UTC' -lc_messages = 'en_US.UTF-8' -lc_monetary = 'en_US.UTF-8' -lc_numeric = 'en_US.UTF-8' -lc_time = 'en_US.UTF-8' -default_text_search_config = 'pg_catalog.english' diff --git a/ansible/templates/postgresql_replica.conf.j2 b/ansible/templates/postgresql_replica.conf.j2 deleted file mode 100644 index df75e231c..000000000 --- a/ansible/templates/postgresql_replica.conf.j2 +++ /dev/null @@ -1,135 +0,0 @@ -# PostgreSQL Configuration for Replica Nodes (Streaming Replication Optimized) -# {{ ansible_managed }} - -# Basic Settings -# https://www.postgresql.org/docs/17/runtime-config-file-locations.html -data_directory = '/var/lib/postgresql/{{ postgresql_version }}/main' -hba_file = '/etc/postgresql/{{ postgresql_version }}/main/pg_hba.conf' -ident_file = '/etc/postgresql/{{ postgresql_version }}/main/pg_ident.conf' -external_pid_file = '/var/run/postgresql/{{ postgresql_version }}-main.pid' - -# Connection Settings -# https://www.postgresql.org/docs/17/runtime-config-connection.html -listen_addresses = '*' -port = 5432 -superuser_reserved_connections = 2 -shared_preload_libraries = 'pg_stat_statements' - -# Streaming replication configuration -# https://www.postgresql.org/docs/17/runtime-config-replication.html -primary_conninfo = 'host={{ primary_node }} port=5432 user={{ repsvc_user }} password={{ repsvc_password }} application_name={{ inventory_hostname }}' -primary_slot_name = '{{ inventory_hostname }}' - -# Memory Settings (optimized for 1GB RAM, 1 core) -# https://www.postgresql.org/docs/17/runtime-config-resource.html -shared_buffers = 128MB # ~12.5% of total RAM (conservative for limited memory) -effective_cache_size = 512MB # ~50% of total RAM for OS cache -work_mem = 2MB # Limited for constrained memory -maintenance_work_mem = 32MB # Conservative for maintenance operations -wal_buffers = 4MB # Smaller WAL buffer -max_worker_processes = 1 # Match core count 
-max_parallel_workers = 1 # Match single core -max_parallel_workers_per_gather = 0 # Disable parallel workers for single core - -# WAL Writing and Flushing (for minimal latency) -wal_sync_method = fdatasync # ADDED: Missing WAL sync method -wal_writer_delay = 200ms # ADDED: Less frequent for single core -wal_writer_flush_after = 1MB # ADDED: WAL writer flush setting -commit_delay = 0 # ADDED: No artificial delay -commit_siblings = 5 # ADDED: Commit siblings setting - -# Write-Ahead Logging (WAL) - Replica settings -# https://www.postgresql.org/docs/17/runtime-config-wal.html -wal_level = replica # Must match primary minimum -wal_keep_size = 500MB # Less than primary -max_slot_wal_keep_size = 1GB - -# Hot Standby Settings (optimized for resource constraints) -# https://www.postgresql.org/docs/17/runtime-config-replication.html -hot_standby = on # Enable read queries on replica -max_standby_streaming_delay = 120s # Longer delay acceptable for resource constraints -max_standby_archive_delay = 120s # Longer delay acceptable -hot_standby_feedback = on # Send feedback to prevent conflicts -wal_receiver_status_interval = 10s # Less frequent updates to save resources -wal_receiver_timeout = 60s # Match primary timeout -wal_retrieve_retry_interval = 10s # Less frequent retries to save resources - -# Recovery Settings -# https://www.postgresql.org/docs/17/runtime-config-wal.html#RUNTIME-CONFIG-WAL-ARCHIVE-RECOVERY -restore_command = '' # Not using archive recovery -recovery_end_command = '' -recovery_target_timeline = 'latest' # Always follow the latest timeline - -# Checkpoints (optimized for limited disk I/O) -# https://www.postgresql.org/docs/17/runtime-config-wal.html#RUNTIME-CONFIG-WAL-CHECKPOINTS -checkpoint_completion_target = 0.9 # Slower completion for limited I/O -checkpoint_timeout = 15min # Longer intervals to reduce I/O load -max_wal_size = 512MB # 1% of disk space before checkpoint -min_wal_size = 128MB # Reasonable minimum -checkpoint_flush_after = 64kB # 
Smaller flushes for limited I/O - -# Background Writer -# https://www.postgresql.org/docs/17/runtime-config-resource.html#RUNTIME-CONFIG-RESOURCE-BACKGROUND-WRITER -bgwriter_delay = 200ms # More frequent background writes -bgwriter_lru_maxpages = 100 -bgwriter_lru_multiplier = 2.0 -bgwriter_flush_after = 256kB - -# Query Planner -# https://www.postgresql.org/docs/17/runtime-config-query.html#RUNTIME-CONFIG-QUERY-CONSTANTS -random_page_cost = 1.5 -effective_io_concurrency = 1 -maintenance_io_concurrency = 1 - -# Logging (focused on replication and queries) -# https://www.postgresql.org/docs/17/runtime-config-logging.html -log_destination = 'stderr' -logging_collector = on -log_directory = 'log' -log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' -log_rotation_age = 1d -log_rotation_size = 50MB -log_min_duration_statement = 2000 # Log slower queries on replica -log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,client=%h,app=%a ' -log_statement = 'none' # Less logging on replica -log_replication_commands = on # Monitor replication -log_checkpoints = on -log_connections = off -log_disconnections = off -log_lock_waits = on -log_recovery_conflict_waits = on # Log recovery conflicts - -# Statistics -# https://www.postgresql.org/docs/17/runtime-config-statistics.html#RUNTIME-CONFIG-CUMULATIVE-STATISTICS -track_activities = on -track_counts = on -track_io_timing = off # Monitor I/O performance -track_functions = none - -# Autovacuum (tuned for resource constraints) -# https://www.postgresql.org/docs/17/routine-vacuuming.html -autovacuum = on -autovacuum_max_workers = 1 # Single worker for single core -autovacuum_naptime = 120s # Less frequent for resource conservation -autovacuum_vacuum_threshold = 100 # Higher thresholds -autovacuum_vacuum_scale_factor = 0.2 # Less aggressive -autovacuum_analyze_threshold = 100 -autovacuum_analyze_scale_factor = 0.15 -autovacuum_work_mem = 16MB # Reduced memory for autovacuum - -# Read-only optimizations -# 
https://www.postgresql.org/docs/17/runtime-config-client.html -default_transaction_isolation = 'read committed' -statement_timeout = 30min # Prevent long-running read queries -lock_timeout = 30s # Prevent lock waits -idle_in_transaction_session_timeout = 10min - -# Locale and Formatting (match primary) -# https://www.postgresql.org/docs/17/locale.html -datestyle = 'iso, mdy' -timezone = 'UTC' -lc_messages = 'en_US.UTF-8' -lc_monetary = 'en_US.UTF-8' -lc_numeric = 'en_US.UTF-8' -lc_time = 'en_US.UTF-8' -default_text_search_config = 'pg_catalog.english' diff --git a/bin/offline-deploy.sh b/bin/offline-deploy.sh index bc38950c7..cd0da1157 100755 --- a/bin/offline-deploy.sh +++ b/bin/offline-deploy.sh @@ -2,7 +2,6 @@ set -euo pipefail - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # HACK: hack to stop ssh from idling the connection. Which it will do if there is no output. And ansible is not verbose enough @@ -11,12 +10,40 @@ loop_pid=$! trap 'kill "$loop_pid"' EXIT -ZAUTH_CONTAINER=$(sudo docker load -i $SCRIPT_DIR/../containers-adminhost/quay.io_wire_zauth_*.tar | awk '{print $3}') +# Load ZAUTH container only if not already present +if ! sudo docker images | grep -q "wire/zauth"; then + echo "Loading ZAUTH container..." + ZAUTH_CONTAINER=$(sudo docker load -i $SCRIPT_DIR/../containers-adminhost/quay.io_wire_zauth_*.tar | awk '{print $3}') +else + echo "ZAUTH container already loaded, skipping..." + ZAUTH_CONTAINER=$(sudo docker images --format "{{.Repository}}:{{.Tag}}" | grep "wire/zauth" | head -1) +fi export ZAUTH_CONTAINER -WSD_CONTAINER=$(sudo docker load -i $SCRIPT_DIR/../containers-adminhost/container-wire-server-deploy.tgz | awk '{print $3}') +# Load WSD container only if not already present +if ! sudo docker images | grep -q "wire-server-deploy"; then + echo "Loading WSD container..." 
+ WSD_CONTAINER=$(sudo docker load -i $SCRIPT_DIR/../containers-adminhost/container-wire-server-deploy.tgz | awk '{print $3}') +else + echo "WSD container already loaded, skipping..." + WSD_CONTAINER=$(sudo docker images --format "{{.Repository}}:{{.Tag}}" | grep "wire-server-deploy" | head -1) +fi +# Create wire secrets ./bin/offline-secrets.sh sudo docker run --network=host -v $SSH_AUTH_SOCK:/ssh-agent -e SSH_AUTH_SOCK=/ssh-agent -v $PWD:/wire-server-deploy $WSD_CONTAINER ./bin/offline-cluster.sh + +# Sync PostgreSQL password from K8s secret to secrets.yaml +echo "Syncing PostgreSQL password from Kubernetes secret..." +sudo docker run --network=host -v $PWD:/wire-server-deploy $WSD_CONTAINER ./bin/sync-k8s-secret-to-wire-secrets.sh \ + wire-postgresql-external-secret \ + password \ + values/wire-server/secrets.yaml \ + .brig.secrets.pgPassword \ + .galley.secrets.pgPassword \ + .spar.secrets.pgPassword \ + .gundeck.secrets.pgPassword + + sudo docker run --network=host -v $PWD:/wire-server-deploy $WSD_CONTAINER ./bin/offline-helm.sh diff --git a/bin/sync-k8s-secret-to-wire-secrets.sh b/bin/sync-k8s-secret-to-wire-secrets.sh new file mode 100755 index 000000000..972f06fac --- /dev/null +++ b/bin/sync-k8s-secret-to-wire-secrets.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Script to sync Kubernetes secret values to Wire server secrets YAML file +# Usage: sync-k8s-secret-to-wire-secrets.sh [yaml-path2] ... +# +# Example: +# sync-k8s-secret-to-wire-secrets.sh wire-postgresql-external-secret password values/wire-server/secrets.yaml \ +# .brig.secrets.pgPassword .galley.secrets.pgPassword + +usage() { + cat << EOF +Usage: $(basename "$0") ... + +Syncs a value from a Kubernetes secret to one or more paths in a YAML file. 
+ +Arguments: + secret-name Name of the Kubernetes secret + secret-key Key within the secret to retrieve + yaml-file Path to the YAML file to update + yaml-path YAML path(s) to update (e.g., .brig.secrets.pgPassword) + +Options: + -n, --namespace Kubernetes namespace (default: default) + -h, --help Show this help message + +Examples: + # PostgreSQL password sync (most common) + $(basename "$0") wire-postgresql-external-secret password \\ + values/wire-server/secrets.yaml \\ + .brig.secrets.pgPassword .galley.secrets.pgPassword .spar.secrets.pgPassword .gundeck.secrets.pgPassword + + # RabbitMQ password sync + $(basename "$0") rabbitmq-secret password \\ + values/wire-server/secrets.yaml \\ + .brig.secrets.rabbitmq.password .galley.secrets.rabbitmq.password + + # Redis password sync + $(basename "$0") redis-secret password \\ + values/wire-server/secrets.yaml \\ + .brig.secrets.redis.password +EOF + exit 1 +} + +# Parse arguments +NAMESPACE="default" +while [[ $# -gt 0 ]]; do + case $1 in + -n|--namespace) + NAMESPACE="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + break + ;; + esac +done + +# Validate required arguments +if [ $# -lt 4 ]; then + echo "❌ ERROR: Missing required arguments" + echo "" + usage +fi + +SECRET_NAME="$1" +SECRET_KEY="$2" +YAML_FILE="$3" +shift 3 +YAML_PATHS=("$@") + +echo "==================================================" +echo "Wire Secrets Synchronization" +echo "==================================================" +echo "" +echo "Secret: $SECRET_NAME/$SECRET_KEY (namespace: $NAMESPACE)" +echo "Target: $YAML_FILE" +echo "Paths: ${YAML_PATHS[*]}" +echo "" + +# Check if kubectl is accessible +if ! kubectl cluster-info &> /dev/null; then + echo "❌ ERROR: Cannot access Kubernetes cluster" + echo " Ensure kubectl is configured and cluster is accessible" + exit 1 +fi +echo "✓ Kubernetes cluster is accessible" + +# Check if the K8s secret exists +if ! 
kubectl get secret "$SECRET_NAME" -n "$NAMESPACE" &> /dev/null; then + echo "❌ ERROR: Kubernetes secret '$SECRET_NAME' not found in namespace '$NAMESPACE'" + exit 1 +fi +echo "✓ Found Kubernetes secret: $SECRET_NAME" + +# Retrieve the value from K8s secret +SECRET_VALUE=$(kubectl get secret "$SECRET_NAME" -n "$NAMESPACE" -o jsonpath="{.data.$SECRET_KEY}" | base64 --decode) +if [ -z "$SECRET_VALUE" ]; then + echo "❌ ERROR: Retrieved value is empty (key '$SECRET_KEY' not found or empty)" + exit 1 +fi +echo "✓ Retrieved value from secret (${#SECRET_VALUE} chars)" + +# Check if YAML file exists +if [ ! -f "$YAML_FILE" ]; then + echo "❌ ERROR: YAML file not found: $YAML_FILE" + exit 1 +fi +echo "✓ Found YAML file: $YAML_FILE" + +# Backup the original file +cp "$YAML_FILE" "$YAML_FILE.bak" +echo "✓ Created backup: $YAML_FILE.bak" + +# Update all specified YAML paths +if command -v yq &> /dev/null; then + echo "✓ Using yq for YAML manipulation" + for yaml_path in "${YAML_PATHS[@]}"; do + echo " Updating: $yaml_path" + yq -y "$yaml_path = \"$SECRET_VALUE\"" "$YAML_FILE" > "$YAML_FILE.tmp" && mv "$YAML_FILE.tmp" "$YAML_FILE" + done +else + echo "❌ ERROR: yq is required for this script" + echo " Install yq: https://github.com/kislyuk/yq" + rm "$YAML_FILE.bak" + exit 1 +fi + +# Verify the update +echo "" +echo "Verification:" +SUCCESS=true +for yaml_path in "${YAML_PATHS[@]}"; do + # Use yq to extract the actual value at the specific path + if command -v yq &> /dev/null; then + EXTRACTED_VALUE=$(yq -r "$yaml_path" "$YAML_FILE" 2>/dev/null || echo "") + if [ "$EXTRACTED_VALUE" = "$SECRET_VALUE" ]; then + echo " ✓ $yaml_path: synced" + else + echo " ⚠ $yaml_path: verification failed (expected ${#SECRET_VALUE} chars, got ${#EXTRACTED_VALUE} chars)" + SUCCESS=false + fi + else + # Fallback verification (less reliable) + FIELD_NAME=$(echo "$yaml_path" | awk -F. 
'{print $NF}') + EXTRACTED_VALUE=$(grep "$FIELD_NAME:" "$YAML_FILE" | head -1 | awk '{print $2}' | tr -d '"' | tr -d "'" || echo "") + if [ "$EXTRACTED_VALUE" = "$SECRET_VALUE" ]; then + echo " ✓ $yaml_path: synced" + else + echo " ⚠ $yaml_path: verification inconclusive (fallback method)" + SUCCESS=false + fi + fi +done + +echo "" +if [ "$SUCCESS" = true ]; then + echo "✅ SUCCESS: All paths synchronized" +else + echo "⚠️ WARNING: Verification inconclusive for some paths" + echo " Manual verification recommended: cat $YAML_FILE" +fi +echo " Backup saved: $YAML_FILE.bak" + +echo "" +echo "==================================================" +echo "Synchronization completed!" +echo "==================================================" diff --git a/changelog.d/3-deploy-builds/pg_ha_cluster b/changelog.d/3-deploy-builds/pg_ha_cluster new file mode 100644 index 000000000..fccc06593 --- /dev/null +++ b/changelog.d/3-deploy-builds/pg_ha_cluster @@ -0,0 +1 @@ +Added: PostgreSQL high availability cluster with repmgr diff --git a/nix/pkgs/wire-binaries.nix b/nix/pkgs/wire-binaries.nix index c4900cc57..8946c56f4 100644 --- a/nix/pkgs/wire-binaries.nix +++ b/nix/pkgs/wire-binaries.nix @@ -18,6 +18,7 @@ let containerd_version = "1.7.22"; minio_version = "RELEASE.2023-07-07T07-13-57Z"; mc_version = "RELEASE.2023-10-24T05-18-28Z"; + repmgr_version = "5.5.0"; # Note: If you change a version, replace the checksum with zeros, run « nix-build --no-out-link -A pkgs.wire-binaries », it will complain and give you the right checksum, use that checksum in this file, run it again and it should build without complaining. 
@@ -141,6 +142,21 @@ let url = "https://apt.postgresql.org/pub/repos/apt/pool/main/p/psycopg2/python3-psycopg2_2.9.10-1.pgdg22.04+1_amd64.deb"; sha256 = "sha256:cc2f749e3af292a67e012edeb4aa5d284f57f2d66a9a09fe5b81e5ffda73cab4"; }; + repmgr = fetchurl rec { + passthru.url = url; + url = "http://apt.postgresql.org/pub/repos/apt/pool/main/r/repmgr/repmgr_${repmgr_version}+debpgdg-1.pgdg22.04+1_all.deb"; + sha256 = "sha256:20c280811e758106335df1eb9954b61aa552823d3129f1e38c488fbd5efe0567"; + }; + repmgr-common = fetchurl rec { + passthru.url = url; + url = "http://apt.postgresql.org/pub/repos/apt/pool/main/r/repmgr/repmgr-common_${repmgr_version}+debpgdg-1.pgdg22.04+1_all.deb"; + sha256 = "sha256:34c660c66a9710fd4f20a66cc932741d3399dbba7e7ae4b67468b3e18f65f61c"; + }; + postgresql-17-repmgr = fetchurl rec { + passthru.url = url; + url = "http://apt.postgresql.org/pub/repos/apt/pool/main/r/repmgr/postgresql-17-repmgr_${repmgr_version}+debpgdg-1.pgdg22.04+1_amd64.deb"; + sha256 = "sha256:520d6ed4d540a2bb9174ac8276f8cb686c0268c13cccb89b28a9cdbd12049df8"; + }; }; in runCommandNoCC "wire-binaries" @@ -149,4 +165,4 @@ runCommandNoCC "wire-binaries" } '' mkdir -p $out ${toString (lib.mapAttrsToList (k: v: "cp ${v} $out/${baseNameOf v.url}\n") srcs)} -'' +'' \ No newline at end of file diff --git a/offline/default-build/build.sh b/offline/default-build/build.sh index 50a42c4bc..0ccfb63d8 100755 --- a/offline/default-build/build.sh +++ b/offline/default-build/build.sh @@ -8,7 +8,7 @@ OUTPUT_DIR="$SCRIPT_DIR/output" # expected structure to be: /wire-server-deploy/offline/default-build/build.sh ROOT_DIR="${SCRIPT_DIR}/../../" -mkdir -p "${OUTPUT_DIR}"/containers-{helm,other,system,adminhost} "${OUTPUT_DIR}"/binaries "${OUTPUT_DIR}"/versions +mkdir -p "${OUTPUT_DIR}"/containers-{helm,other,system,adminhost} "${OUTPUT_DIR}"/binaries "${OUTPUT_DIR}"/versions # Define the output tar file OUTPUT_TAR="${OUTPUT_DIR}/assets.tgz" @@ -19,7 +19,7 @@ TASKS_DIR="${SCRIPT_DIR}/../tasks" #cp 
$SCRIPT_DIR/..//output/containers-helm.tar "${OUTPUT_DIR}"/ # one need to comment the tasks below for which one wants to optimize the build -# Any of the tasks can be skipped by commenting them out +# Any of the tasks can be skipped by commenting them out # however, mind the dependencies between them and how they are grouped # Processing helm charts @@ -29,7 +29,7 @@ TASKS_DIR="${SCRIPT_DIR}/../tasks" "${TASKS_DIR}"/proc_pull_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_EXCLUDE_LIST="inbucket,wire-server-enterprise,coturn,postgresql" # pulling the charts from helm-charts repo, charts to be included are passed as arguments HELM_CHART_INCLUDE_LIST -"${TASKS_DIR}"/proc_pull_ext_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_INCLUDE_LIST="postgresql-external" +# "${TASKS_DIR}"/proc_pull_ext_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_INCLUDE_LIST="postgresql-external" # copy local copy of values from root directory to output directory cp -r "${ROOT_DIR}"/values "${OUTPUT_DIR}"/ @@ -41,8 +41,8 @@ cp -r "${ROOT_DIR}"/dashboards "${OUTPUT_DIR}"/ "${TASKS_DIR}"/pre_chart_process_0.sh "${OUTPUT_DIR}" # all extra pre chart processing tasks for this profile should come here -# pre_chart_process_1.sh -# pre_chart_process_2.sh +# pre_chart_process_1.sh +# pre_chart_process_2.sh # processing the charts # here we also filter the images post processing the helm charts diff --git a/offline/min-build/build.sh b/offline/min-build/build.sh index ff249ed96..4e9fb858e 100755 --- a/offline/min-build/build.sh +++ b/offline/min-build/build.sh @@ -19,7 +19,7 @@ TASKS_DIR="${SCRIPT_DIR}/../tasks" #cp $SCRIPT_DIR/..//output/containers-helm.tar "${OUTPUT_DIR}"/ # one need to comment the tasks below for which one wants to optimize the build -# Any of the tasks can be skipped by commenting them out +# Any of the tasks can be skipped by commenting them out # however, mind the dependencies between them and how they are grouped # Processing helm charts @@ -31,7 +31,7 @@ 
HELM_CHART_EXCLUDE_LIST="inbucket,wire-server-enterprise,k8ssandra-operator,k8ss "${TASKS_DIR}"/proc_pull_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_EXCLUDE_LIST="${HELM_CHART_EXCLUDE_LIST}" # pulling the charts from helm-charts repo, charts to be included are passed as arguments HELM_CHART_INCLUDE_LIST -"${TASKS_DIR}"/proc_pull_ext_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_INCLUDE_LIST="postgresql-external" +# "${TASKS_DIR}"/proc_pull_ext_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_INCLUDE_LIST="postgresql-external" # copy local copy of values from root directory to output directory cp -r "${ROOT_DIR}"/values "${OUTPUT_DIR}"/ @@ -44,7 +44,7 @@ cp -r "${ROOT_DIR}"/values "${OUTPUT_DIR}"/ # all extra pre chart processing tasks for this profile should come here # pre_chart_process_1.sh -# pre_chart_process_2.sh +# pre_chart_process_2.sh # processing the charts # here we also filter the images post processing the helm charts diff --git a/offline/postgresql-cluster.md b/offline/postgresql-cluster.md index bb2f72542..225042f30 100644 --- a/offline/postgresql-cluster.md +++ b/offline/postgresql-cluster.md @@ -1,70 +1,159 @@ -# PostgreSQL Cluster Deployment +# PostgreSQL High Availability Cluster Deployment Guide ## Table of Contents -- [Overview](#overview) -- [Architecture](#architecture) +- [Architecture Overview](#architecture-overview) +- [Key Concepts](#key-concepts) +- [Minimum System Requirements](#minimum-system-requirements) +- [High Availability Features](#high-availability-features) - [Inventory Definition](#inventory-definition) -- [Running the Playbook](#running-the-playbook) -- [PostgreSQL Packages Installation Playbook](#postgresql-packages-installation-playbook) -- [Deployment Architecture](#deployment-architecture) -- [Monitoring and Verification](#monitoring-and-verification) +- [Installation Process](#installation-process) +- [Deployment Commands Reference](#deployment-commands-reference) +- [Monitoring Checks After 
Installation](#monitoring-checks-after-installation) +- [Configuration Options](#configuration-options) +- [Node Recovery Operations](#node-recovery-operations) +- [How It Confirms a Reliable System](#how-it-confirms-a-reliable-system) +- [Kubernetes Integration](#kubernetes-integration) - [Wire Server Database Setup](#wire-server-database-setup) -- [Troubleshooting](#troubleshooting) -- [Best Practices](#best-practices) -- [Security Considerations](#security-considerations) -## Overview of PostgreSQL Cluster Deployment +## Architecture Overview -## Overview -The [`postgresql-deploy.yml`](../ansible/postgresql-deploy.yml) playbook is designed to deploy a highly available PostgreSQL cluster using streaming replication. The cluster consists of one primary (read-write) node and two replica (read-only) nodes, providing fault tolerance and read scaling capabilities. The deployment includes tasks for installing PostgreSQL packages, deploying the primary node, deploying replica nodes, verifying the deployment, and setting up the Wire server database and user. 
- -## Architecture - -### Cluster Topology -The PostgreSQL cluster implements a **Primary-Replica** architecture with **asynchronous streaming replication**: +**Primary-Replica HA Architecture** with intelligent split-brain protection and automatic failover: ``` -┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ PostgreSQL1 │ │ PostgreSQL2 │ │ PostgreSQL3 │ -│ (Primary) │ │ (Replica) │ │ (Replica) │ -│ Read/Write │────│ Read-Only │ │ Read-Only │ -│ │ │ │ │ │ -└─────────────────┘ └─────────────────┘ └─────────────────┘ - │ │ │ - └───────────────────────┼───────────────────────┘ - │ - Streaming Replication +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ PostgreSQL1 │ │ PostgreSQL2 │ │ PostgreSQL3 │ +│ (Primary) │───▶│ (Replica) │ │ (Replica) │ +│ Read/Write │ │ Read-Only │ │ Read-Only │ +└─────────────┘ └─────────────┘ └─────────────┘ + │ │ │ + └───────────────────┼───────────────────┘ + │ + ┌─────────────────────────────┐ + │ Split-Brain Protection │ + │ & Automatic Failover │ + └─────────────────────────────┘ ``` -### Key Components +**Core Components:** +- **PostgreSQL 17**: Streaming replication with performance improvements +- **repmgr**: Cluster management and automatic failover orchestration +- **Split-Brain Detection**: Prevents data corruption scenarios +- **Event-Driven Recovery**: Automatic cluster state management + +## Key Concepts + +### Technology Stack +- **PostgreSQL 17**: Latest stable version with streaming replication ([docs](https://www.postgresql.org/docs/17/warm-standby.html)) +- **repmgr/repmgrd**: Cluster management and automatic failover ([docs](https://repmgr.org/)) +- **Split-Brain Detection**: Intelligent monitoring prevents data corruption +- **Wire Integration**: Pre-configured database setup +- **Offline Deployment**: For offline deployments, packages are installed from local URLs defined in 
[`ansible/inventory/offline/group_vars/postgresql/postgresql.yml`](../ansible/inventory/offline/group_vars/postgresql/postgresql.yml), bypassing repositories. + +### Software Versions +- **PostgreSQL**: 17.5 (latest stable with enhanced replication features) +- **repmgr**: 5.5.0 (production-ready cluster management with advanced failover) ([docs](https://repmgr.org/docs/current/)) +- **Ubuntu/Debian**: 20.04+ / 11+ (tested platforms for production deployment) + +## Minimum System Requirements + +Based on the PostgreSQL configuration template, the deployment is optimized for resource-constrained environments: + +**Memory Requirements:** +- **RAM**: 1GB minimum per node (based on configuration tuning) + - `shared_buffers = 256MB` (25% of total RAM) + - `effective_cache_size = 512MB` (50% of total RAM estimate) + - `maintenance_work_mem = 64MB` + - `work_mem = 2MB` per connection (with `max_connections = 20`) + +**CPU Requirements:** +- **Cores**: 1 CPU core minimum + - `max_parallel_workers_per_gather = 0` (parallel queries disabled) + - `max_parallel_workers = 1` + - `max_worker_processes = 2` (minimum for repmgr operations) + +**Storage Requirements:** +- **Disk Space**: 50GB minimum per node + - `wal_keep_size = 2GB` (4% of disk) + - `max_slot_wal_keep_size = 3GB` (6% of disk) + - `max_wal_size = 1GB` (2% of disk) + - Additional space for PostgreSQL data directory and logs + +**Operating System Requirements:** +- **Linux Distribution**: Ubuntu/Debian (systemd-based) +- **Filesystem**: ext4/xfs (configured with `wal_sync_method = fdatasync`) +- **Package Management**: apt-based package installation + +**Network Requirements:** +- **PostgreSQL Port**: 5432 open between all cluster nodes + +**Note**: Configuration supports up to 20 concurrent connections. For production workloads with higher loads, scale up resources accordingly. 
+ +**⚠️ Important**: Review and optimize the [PostgreSQL configuration template](../ansible/templates/postgresql/postgresql.conf.j2) based on your specific hardware, workload, and performance requirements before deployment. + +## High Availability Features +- **Detection**: repmgrd monitors primary connectivity with configurable timeouts ([repmgr failover](https://repmgr.org/docs/current/failover.html)) +- **Failover Validation**: Quorum-based promotion with lag checking and connectivity validation +- **Promotion**: Promotes replica with most recent data automatically +- **Rewiring**: Remaining replicas connect to new primary automatically + +**Failover Validation Features:** +- **Quorum Requirements**: For 3+ node clusters, requires ≥2 visible nodes for promotion +- **Lag Validation**: Checks WAL replay lag against configurable threshold (64MB default) +- **Recovery State**: Ensures candidate is in recovery mode before promotion +- **Connectivity Checks**: Validates WAL receiver activity + +### 🛡️ Split-Brain Protection + +**Detection Logic:** +1. **Self-Check**: Am I an isolated primary? (no active replicas connected) +2. **Cross-Node Verification**: Query all other cluster nodes to detect conflicting primaries +3. **Conflict Resolution**: If split-brain detected → mask and stop PostgreSQL service + +**Advanced Features:** +- **Multi-Node Checking**: Verifies primary status across all cluster nodes +- **Graceful Shutdown**: Masks service to prevent restart attempts, then stops PostgreSQL +- **Force Termination**: Uses `systemctl kill` if normal stop fails +- **Event Logging**: Comprehensive logging to syslog and journal -1. 
**Primary Node (postgresql1)**: - - Handles all write operations and read queries - - Sends WAL (Write-Ahead Log) records to replicas - - Manages replication slots for each replica - - Configured with `wal_level = replica` +**Recovery:** Event-driven fence script updates node status in the repmgr database and automatically unmasks services during successful rejoins (manual unmasking required for split-brain resolution) -2. **Replica Nodes (postgresql2, postgresql3)**: - - Receive and apply WAL records from primary - - Can handle read-only queries (hot standby) - - Use physical replication slots for connection management - - Automatically reconnect to primary if connection is lost +### 🔄 Self-Healing Capabilities -3. **Replication Mechanism**: - - **Streaming Replication**: Real-time transmission of WAL records - - **Asynchronous Mode**: Optimized for performance over strict consistency - - **Physical Replication Slots**: Ensure WAL retention for disconnected replicas - - **Hot Standby**: Replicas accept read-only queries during replication +| Scenario | Detection | Recovery Time | Data Loss | +|----------|-----------|---------------|-----------| +| Primary Failure | 25-60 seconds | < 30 seconds | None | +| Network Partition | 30-120 seconds | Automatic | None | +| Node Recovery | Immediate | < 2 minutes | None | -### High Availability Features +**Primary Failure**: repmgrd monitors connectivity (2s intervals), confirms failure after 5 attempts (~10s), validates quorum (≥2 nodes for 3+ clusters), selects best replica by priority/lag, promotes automatically with zero data loss. 
-- **Automatic Failover**: Manual promotion of replica to primary when needed -- **WAL Retention**: Primary retains WAL data for replica recovery -- **Connection Management**: Replicas automatically reconnect after network issues -- **Read Load Distribution**: Read queries can be distributed across replicas +**Network Partition**: 30s timer triggers cross-node verification, isolates conflicting primaries by masking/stopping services, auto-recovers when network restores with timeline synchronization if needed. + +**Node Recovery**: Auto-starts in standby mode, connects to current primary, uses pg_rewind for timeline divergence, registers with repmgr, catches up via WAL streaming within 2 minutes. + +### 📊 Monitoring & Event System + +**Automated split-brain detection** runs every 30 seconds via systemd timer, with cross-node verification to prevent data corruption. Event-driven fence scripts handle service masking/unmasking during cluster state changes. + +**Key monitoring commands:** +- Cluster status: `sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf cluster show` +- Service status: `sudo systemctl status postgresql@17-main repmgrd@17-main detect-rogue-primary.timer` +- Replication status: `sudo -u postgres psql -c "SELECT application_name, client_addr, state FROM pg_stat_replication;"` +- Logs: `sudo journalctl -u detect-rogue-primary.service --since "10m ago"` ## Inventory Definition -The PostgreSQL [inventory](../ansible/inventory/offline/99-static) is structured as follows: + +The PostgreSQL cluster requires a properly structured inventory to define node roles and configuration. The inventory file should be located at `ansible/inventory/offline/hosts.ini` or your specific environment path. 
+ +### Inventory Structure ```ini [all] @@ -74,449 +163,474 @@ postgresql3 ansible_host=192.168.122.206 [postgresql:vars] postgresql_network_interface = enp1s0 +postgresql_version = 17 wire_dbname = wire-server wire_user = wire-server -# if not defined, a random password will be generated -# wire_pass = verysecurepassword +# Optional: wire_pass = verysecurepassword (if not defined, auto-generated) -# Add all postgresql nodes here +# All PostgreSQL nodes [postgresql] postgresql1 postgresql2 postgresql3 -# Add all postgresql primary nodes here + +# Primary (read-write) node [postgresql_rw] postgresql1 -# Add all postgresql read-only nodes here i.e. replicas + +# Replica (read-only) nodes [postgresql_ro] postgresql2 postgresql3 - ``` -#### Node Groups: +### Node Groups Explained + +| Group | Purpose | Nodes | Role | +|-------|---------|-------|------| +| `postgresql` | All PostgreSQL nodes | postgresql1-3 | Base configuration | +| `postgresql_rw` | Primary nodes | postgresql1 | Read/Write operations | +| `postgresql_ro` | Replica nodes | postgresql2-3 | Read-only operations | -- `postgresql`: Group containing all PostgreSQL nodes. -- `postgresql_rw`: Group containing the primary (read-write) PostgreSQL node. -- `postgresql_ro`: Group containing the replica (read-only) PostgreSQL nodes. +### Configuration Variables -#### Variables: +| Variable | Default | Description | Required | +|----------|---------|-------------|----------| +| `postgresql_network_interface` | `enp1s0` | Network interface for cluster communication | No | +| `postgresql_version` | `17` | PostgreSQL major version | No | +| `wire_dbname` | `wire-server` | Database name for Wire application | Yes | +| `wire_user` | `wire-server` | Database user for Wire application | Yes | +| `wire_pass` | auto-generated | Password (displayed as output of the ansible task) | No | -- `postgresql_network_interface`: Network interface for PostgreSQL nodes (optional, defaults to `enp1s0`). 
-- `wire_dbname`: Name of the Wire server database. -- `wire_user`: User for the Wire server database. -- `wire_pass`: Password for the wire server, if not defined, a random password will be generated. Password will be displayed on the output once the playbook has finished creating the user. Use this password to configure wire-server helm charts. -### Running the Playbook +## Installation Process -To run the [`postgresql-deploy.yml`](../ansible/postgresql-deploy.yml) playbook, use the following command: +### 🚀 Complete Installation (Fresh Deployment) + +#### **Step 1: Verify Connectivity** ```bash -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml +# Test Ansible connectivity to all nodes +ansible all -i ansible/inventory/offline/hosts.ini -m ping ``` -**Note**: The ansible commands should be run using the WSD_CONTAINER container as explained in the [Making tooling available in your environment](./docs_ubuntu_22.04.md#making-tooling-available-in-your-environment) documentation. +#### **Step 2: Full Cluster Deployment** +See the [Deployment Commands Reference](#deployment-commands-reference) section for all available deployment commands. -#### Tags and Selective Execution +**⏱️ Expected Duration: 10-15 minutes** -The playbook uses tags to allow selective execution of specific components. You can run only specific parts of the deployment by using the `--tags` or `--skip-tags` options: +A complete deployment performs: +1. ✅ **Package Installation**: PostgreSQL 17 + repmgr + dependencies +2. ✅ **Primary Setup**: Configure primary node with repmgr database +3. ✅ **Replica Deployment**: Clone and configure replica nodes +4. ✅ **Verification**: Health checks and replication status +5. ✅ **Wire Integration**: Create Wire database and user +6. 
✅ **Monitoring**: Deploy split-brain detection system -**Tag Reference Table:** +#### **Step 3: Verify Installation** +See the [Monitoring Checks](#monitoring-checks-after-installation) section for comprehensive verification procedures. -| Component | Tag | Description | -|-----------|-----|-------------| -| Package Installation | `install` | Installs PostgreSQL packages and dependencies | -| Primary Node | `primary` | Deploys and configures the primary PostgreSQL node | -| Replica Nodes | `replica` | Deploys and configures replica PostgreSQL nodes | -| Verification | `verify` | Verifies cluster health and replication status | -| Wire Setup | `wire-setup` | Creates Wire database and user account | -| All Components | `postgresql` | Runs all PostgreSQL deployment tasks | +## Deployment Commands Reference -**Example usage with tags**: +### 🎯 Main Commands ```bash -# Install packages only -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --tags "install" - -# Deploy only primary node -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --tags "primary" - -# Deploy primary and replicas, skip - wire setup, install and verify -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --tags "primary,replica" - -# Skip installation (if PostgreSQL is already installed) -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --skip-tags "install" +# Complete fresh deployment +ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml -# Skip wire setup and verification -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --skip-tags "wire-setup,verify" +# Clean previous deployment +# Only cleans up stale configuration; the data remains intact +ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --tags cleanup -# Run only verification -ansible-playbook 
-i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --tags "verify" +# Deploy without the cleanup process +ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --skip-tags "cleanup" ``` -## PostgreSQL Packages Installation Playbook +### 🏷️ Tag-Based Deployments -### Overview -This playbook installs PostgreSQL packages and their dependencies on hosts belonging to the `postgresql` group. The installation supports both online repository-based installation and offline package deployment for air-gapped environments. +| Tag | Description | Example | +|-----|-------------|---------| +| `cleanup` | Clean previous deployment state | `--tags "cleanup"` | +| `install` | Install PostgreSQL packages only | `--tags "install"` | +| `primary` | Deploy primary node only | `--tags "primary"` | +| `replica` | Deploy replica nodes only | `--tags "replica"` | +| `verify` | Verify HA setup only | `--tags "verify"` | +| `wire-setup` | Wire database setup only | `--tags "wire-setup"` | +| `monitoring` | Deploy cluster monitoring only | `--tags "monitoring"` | -### Installation Architecture +## Monitoring Checks After Installation -The package installation follows a layered approach: +### 🛡️ Key Verification Commands -``` -┌─────────────────────────────────────────────────────────────────────┐ -│ Package Dependencies │ -├─────────────────────────────────────────────────────────────────────┤ -│ System Dependencies: libssl-dev, libllvm15, sysstat, ssl-cert │ -├─────────────────────────────────────────────────────────────────────┤ -│ PostgreSQL Core: libpq5, postgresql-common, postgresql-client │ -├─────────────────────────────────────────────────────────────────────┤ -│ PostgreSQL Server: postgresql-17, postgresql-client-17 │ -├─────────────────────────────────────────────────────────────────────┤ -│ Python Integration: python3-psycopg2 │ -└─────────────────────────────────────────────────────────────────────┘ -``` +```bash +# 1. 
Cluster status (primary command) +sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf cluster show -### Variables +# 2. Service status +sudo systemctl status postgresql@17-main repmgrd@17-main detect-rogue-primary.timer -| Variable | Description | -|------------------------------|-----------------------------------------------------------------------------| -| `postgresql_version` | Version of PostgreSQL to install (e.g., 17). | -| `postgresql_data_dir` | Directory where PostgreSQL data will be stored. | -| `postgresql_conf_dir` | Directory where PostgreSQL configuration files will be stored. | -| `repmgr_user` | User for repmgr (PostgreSQL replication manager). | -| `repmgr_password` | Password for the repmgr user. | -| `repmgr_database` | Database name for repmgr. | -| `postgresql_use_repository` | Boolean to install packages from the repository (`true`) or from URLs (`false`). Default is `false`. | -| `postgresql_pkgs` | List of dictionaries containing details about PostgreSQL packages to download and install. Each dictionary includes `name`, `url`, and `checksum`. | +# 3. Replication status (run on primary) +sudo -u postgres psql -c "SELECT application_name, client_addr, state FROM pg_stat_replication;" -### PostgreSQL Packages +# 4. Check split-brain detector logs +sudo journalctl -u detect-rogue-primary.service --since "10m ago" -The following packages are required for a complete PostgreSQL installation when not using an online repository: +# 5. Check fence events +sudo tail -n 20 -f /var/log/postgresql/fence_events.log -1. **libpq5**: PostgreSQL C client library. -2. **postgresql-client-common**: Common files for PostgreSQL client applications. -3. **postgresql-common-dev**: Development files for PostgreSQL common components. -4. **postgresql-common**: Common scripts and files for PostgreSQL server and client packages. -5. **postgresql-client-17**: Client applications for PostgreSQL version 17. -6. 
**postgresql-17**: Main PostgreSQL server package for version 17. -7. **python3-psycopg2**: PostgreSQL adapter for Python. +# 6. Manual promotion (rare emergency case) +sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf standby promote +``` -### Offline Package Management +## How It Confirms a Reliable System -When not using the online repository (`postgresql_use_repository = false`), packages will be downloaded from the `assethost` setup. Ensure the offline sources are configured by running: +### 🛡️ Reliability Features +- **Split-Brain Prevention**: 30-second monitoring with automatic protection +- **Automatic Failover**: < 30 seconds detection and promotion +- **Data Consistency**: Streaming replication with timeline management +- **Self-Healing**: Event-driven recovery and service management +### 🎯 Quick Health Check ```bash -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/setup-offline-sources.yml --limit assethost,postgresql +sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf cluster show +sudo systemctl status detect-rogue-primary.timer +sudo -u postgres psql -c "SELECT * FROM pg_stat_replication;" ``` -**Note**: If the above command has already been executed with the latest wire-server-deploy artifacts, avoid running it again. However, if PostgreSQL is being updated or installed for the first time, it is recommended to run this command to ensure all required packages are available from the latest wire-server-deploy artifacts. 
+**Expected**: One primary "* running", all replicas "running", timer "active (waiting)" + +## Configuration Options + +### 🔧 repmgr Configuration +- **Node Priority**: `priority` Determines promotion order during failover (higher values preferred) +- **Monitoring Interval**: `monitor_interval_secs` (default: 2 seconds) +- **Reconnect Settings**: `reconnect_attempts` (default: 5), `reconnect_interval` (default: 5 seconds) + +*Configuration file: [`ansible/inventory/offline/group_vars/postgresql/postgresql.yml`](../ansible/inventory/offline/group_vars/postgresql/postgresql.yml)* + +**Node Configuration:** +```yaml +repmgr_node_config: + postgresql1: # Primary node + node_id: 1 + priority: 150 + role: primary + postgresql2: # First standby + node_id: 2 + priority: 100 + role: standby + postgresql3: # Second standby + node_id: 3 + priority: 50 + role: standby +``` -### Tasks +*See [repmgr configuration reference](https://repmgr.org/docs/current/configuration-file.html) for complete options.* -The installation process follows a systematic approach ensuring all dependencies are met: +### 🛡️ Failover Validation +- **Quorum**: Minimum 2 visible nodes for 3+ node clusters +- **Lag Threshold**: `LAG_CAP` environment variable (default: 64MB) +- **Connectivity**: WAL receiver activity validation -1. **Install PostgreSQL dependencies**: - - **System Libraries**: Installs core dependencies for PostgreSQL operation - - `libssl-dev`: SSL/TLS support for secure connections - - `libllvm15`: Required for JIT compilation support - - `sysstat`: System performance monitoring tools - - `ssl-cert`: SSL certificate management utilities - - `libjson-perl`, `libipc-run-perl`: Perl libraries for PostgreSQL utilities +## Node Recovery Operations -2. 
**Repository-based Installation** (when `postgresql_use_repository = true`): - - **Package Selection**: Installs packages from PostgreSQL official repository - - `postgresql-{{ postgresql_version }}`: Main server package - - `postgresql-client-{{ postgresql_version }}`: Client tools and libraries - - `python3-psycopg2`: Python database adapter for Ansible modules +### 🔄 Standard Node Rejoin + +```bash +# Compatible data rejoin +sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf node rejoin -d repmgr -h -U repmgr --verbose -3. **Offline Package Management** (when `postgresql_use_repository = false`): - - **Version Verification**: Checks if packages are already installed to avoid conflicts - - **Package Download**: Downloads `.deb` files from specified URLs with checksum verification - - **Local Installation**: Installs packages using `dpkg` for air-gapped environments - - **Cleanup Process**: Removes downloaded files to conserve disk space +# Timeline divergence rejoin +sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf node rejoin -d repmgr -h -U repmgr --force-rewind --verbose +``` -4. **Package Integrity**: - - **Checksum Validation**: Ensures package integrity during download - - **Dependency Resolution**: Handles package dependencies automatically - - **Installation Verification**: Confirms successful installation of all components +### 🚨 Emergency Recovery -### Usage -To run the [`postgresql-install.yml`](../ansible/postgresql-playbooks/postgresql-install.yml) playbook independently, use the following command: +Usually the recovery time is very fast on postgres cluster level (30 seconds to a minute) but for the application it might take from 1 minute to 2 minutes. The reason is postgres-endpoint-manager cronjob runs every 2 minutes to check and update the postgresql endpoints if necessary. 
-```bash -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-playbooks/postgresql-install.yml -``` +**Complete Cluster Failure (All Nodes Down):** -Alternatively, you can run just the installation step from the main playbook using tags: +When all PostgreSQL nodes fail simultaneously (power outage, network failure, etc.), follow this recovery procedure: +**Step 1: Identify the Most Recent Primary** +On each node, check the data consistency and timeline: ```bash -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --tags "install" -``` +# Check control data on each node +sudo -u postgres /usr/lib/postgresql/17/bin/pg_controldata /var/lib/postgresql/17/main | grep -E "Latest checkpoint location|TimeLineID|Time of latest checkpoint|Database system identifier" -## Deployment Architecture +# Compare LSN (Log Sequence Number) - highest LSN has most recent data +``` -### Primary Node Deployment Process +**Step 2: Start the Most Recent Primary** +Choose the node with the highest LSN/most recent checkpoint: +```bash +# Start PostgreSQL service on the chosen node +sudo systemctl start postgresql@17-main -The primary node deployment is handled by the [`postgresql-deploy-primary.yml`](../ansible/postgresql-playbooks/postgresql-deploy-primary.yml) playbook, which performs the following key operations: +# Register as new primary (removes old cluster metadata) +sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf primary register --force -#### 1. Pre-deployment Checks -- **Replication User Verification**: Checks if the replication user (`repmgr_user`) already exists -- **Replication Slots Check**: Verifies existing replication slots for replica nodes -- **Service Status**: Ensures PostgreSQL service is ready for configuration - -#### 2. 
Configuration Management -- **pg_hba.conf Configuration**: Sets up authentication rules for: - - Local connections using peer authentication - - Replication connections from replica nodes - - Inter-cluster communication -- **Primary PostgreSQL Configuration**: Applies optimized settings via [postgresql_primary.conf.j2](../ansible/templates/postgresql_primary.conf.j2). - -#### 3. Replication Setup -- **Replication User Creation**: Creates the replication user with `REPLICATION,LOGIN` privileges -- **Physical Replication Slots**: Creates dedicated slots for each replica (`postgresql2`, `postgresql3`) -- **Service Management**: Restarts and enables PostgreSQL service - -#### 4. Readiness Verification -- **Port Availability**: Waits for PostgreSQL to accept connections on port 5432 - -### Replica Node Deployment Process - -The replica deployment is managed by the [`postgresql-deploy-replica.yml`](../ansible/postgresql-playbooks/postgresql-deploy-replica.yml) playbook with the following workflow: - -#### 1. Replica State Assessment -- **Configuration Check**: Verifies if replica is already configured (`standby.signal` file presence) -- **Service Status**: Checks current PostgreSQL service state -- **Data Directory**: Assesses existing data directory state - -#### 2. Configuration Deployment -- **Authentication Setup**: Configures `pg_hba.conf` for replica-specific rules -- **Replica Configuration**: Applies [`postgresql_replica.conf.j2`](../ansible/templates/postgresql_replica.conf.j2) with: - ``` - primary_conninfo = 'host= user= ...' - primary_slot_name = '' - hot_standby = on - max_standby_streaming_delay = 120s - ``` - -#### 3. 
Base Backup Process -For unconfigured replicas, the playbook performs: -- **Service Shutdown**: Stops PostgreSQL service safely -- **Data Directory Cleanup**: Removes existing data to prevent conflicts -- **pg_basebackup Execution**: Creates replica from primary using: - ```bash - pg_basebackup -h -U -D -P -R -X stream - ``` -- **Standby Signal**: Creates `standby.signal` file to mark as replica - -#### 4. Replica Activation -- **Service Startup**: Starts PostgreSQL in hot standby mode -- **Connection Verification**: Ensures replica connects to primary successfully -- **Replication PostgreSQL service Status**: Waits for PostgreSQL to accept connections on port 5432 - -### Security Configuration - -#### Authentication Matrix -The [`pg_hba.conf`](../ansible/templates/pg_hba.conf.j2) template implements a security model with: - -| Connection Type | User | Source | Method | Purpose | -|----------------|------|--------|---------|---------| -| Local | All | Unix Socket | peer | Local admin access | -| Host | All | 127.0.0.1/32 | md5 | Local TCP connections | -| Host | repmgr_user | replica_nodes | md5 | Streaming replication | -| Host | All | primary_network | md5 | Inter-cluster communication | - -#### Network Security -- **Restricted Access**: Only defined IP addresses can connect -- **Encrypted Connections**: MD5 authentication for network connections -- **Replication Isolation**: Dedicated user for replication traffic - -### Performance Optimization - -#### Resource-Constrained Configuration -The deployment is optimized for environments with limited resources (1GB RAM, 1 core, 50GB disk): - -**Memory Settings:** -- `shared_buffers = 128MB` (~12.5% of RAM) -- `effective_cache_size = 512MB` (~50% of RAM) -- `work_mem = 2MB` (conservative for limited memory) -- `maintenance_work_mem = 32MB` - -**WAL Management:** -- `wal_keep_size = 2GB` (4% of disk space) -- `max_slot_wal_keep_size = 3GB` (6% of disk space) -- `wal_writer_delay = 200ms` (optimized for single core) - 
-**Replication Tuning:** -- Asynchronous replication for performance -- Physical replication slots for reliability -- Optimized timeouts for resource constraints - -## Monitoring and Verification - -### Automated Verification Process - -The [`postgresql-verify-HA.yml`](../ansible/postgresql-playbooks/postgresql-verify-HA.yml) playbook provides comprehensive health checks: - -#### 1. Streaming Replication Status -Monitors real-time replication metrics: -```sql -SELECT - client_addr, - application_name, - state, - sync_state, - pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) as lag_size, - CASE - WHEN pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) = 0 THEN 'SYNCHRONIZED' - WHEN pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) < 1024*1024 THEN 'NEAR_SYNC' - ELSE 'LAGGING' - END as status -FROM pg_stat_replication; +# Start repmgrd daemon and split-brain detection +sudo systemctl start repmgrd@17-main +sudo systemctl start detect-rogue-primary.timer ``` -#### 2. Replication Slot Health -Validates slot availability and lag: -```sql -SELECT - slot_name, - active, - pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) as slot_lag, - CASE - WHEN active THEN 'ACTIVE' - ELSE 'INACTIVE - CHECK REPLICA' - END as slot_status -FROM pg_replication_slots; -``` +**Step 3: Rejoin Other Nodes as Standby** +For each remaining node: +```bash +# Start PostgreSQL service +sudo systemctl start postgresql@17-main -### Manual Health Checks +# Force rejoin as standby (handles timeline divergence) +sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf node rejoin -d repmgr -h -U repmgr --force-rewind --verbose + +# Start repmgrd daemon and split-brain detection after successful rejoin +sudo systemctl start repmgrd@17-main +sudo systemctl start detect-rogue-primary.timer +``` -#### Primary Node Status +**Step 4: Verify Cluster Recovery** ```bash -# Check replication status -sudo -u postgres psql -c "SELECT * FROM pg_stat_replication;" +# Check cluster 
status +sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf cluster show -# Verify replication slots -sudo -u postgres psql -c "SELECT * FROM pg_replication_slots;" +# Verify replication +sudo -u postgres psql -c "SELECT application_name, client_addr, state FROM pg_stat_replication;" -# Check WAL sender processes -ps aux | grep "walsender" +# Check all services are running +sudo systemctl status postgresql@17-main repmgrd@17-main detect-rogue-primary.timer ``` -#### Replica Node Status from replica nodes +**⚠️ Important Notes:** +- **Data Loss Risk**: If nodes have divergent data, some transactions may be lost +- **Timeline Handling**: `--force-rewind` automatically handles timeline divergence +- **Service Order**: Always start PostgreSQL before attempting repmgr operations +- **Backup Recovery**: If all nodes are corrupted, restore from backup before following this procedure + +**Expected Recovery Time**: 5-15 minutes depending on data size and number of nodes + +**Bring back the old primary as standby (Split-Brain Resolution):** +- Get the current primary node ip with `sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf cluster show` on an active node. +- `ssh` into the old primary +- Unmask service and rejoin the cluster as standby with this command: `sudo systemctl unmask postgresql@17-main.service && sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf node rejoin -d repmgr -h <new_primary_ip> -U repmgr --force-rewind --verbose` +- Service auto-starts in standby mode and will start following the new primary when the rejoin succeeds; if it fails, the node might join the cluster as a standalone standby. +- Check the cluster status `sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf cluster show` to make sure the node joins the cluster properly and the upstream is the new primary. 
+- If the upstream of the re-joined node is empty that means the re-join failed partially, please rerun the above procedure by +- masking and stopping postgresql first: `sudo systemctl mask postgresql@17-main && sudo systemctl stop postgresql@17-main` +- Run the unmask and rejoin command. That should be it. + +### 🔧 OS Upgrades and Maintenance Operations + +**Behavior During OS Upgrades**: PostgreSQL HA cluster handles OS-level maintenance (firmware updates, kernel upgrades, reboots) gracefully with automatic failover and recovery. + +#### **Planned Maintenance (Single Node)** +1. **Pre-Reboot**: + - **For major OS updates**: Disable repmgrd and split-brain detection to prevent conflicts: + ```bash + sudo systemctl stop repmgrd@17-main && sudo systemctl disable repmgrd@17-main + sudo systemctl stop detect-rogue-primary.timer && sudo systemctl disable detect-rogue-primary.timer + ``` + - **For routine reboots**: No manual intervention required, repmgr automatically detects node unavailability +2. **During Reboot**: + - If **replica node**: Cluster continues normally with remaining nodes + - If **primary node**: Automatic failover occurs (~10-30s), promotes best replica +3. **Post-Reboot**: + - **After major OS updates**: Manually rejoin cluster in standby mode: + ```bash + # Start PostgreSQL service + sudo systemctl start postgresql@17-main + # Manually rejoin as standby + sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf node rejoin -d repmgr -h -U repmgr --verbose + # Re-enable services after successful rejoin + sudo systemctl enable repmgrd@17-main && sudo systemctl start repmgrd@17-main + sudo systemctl enable detect-rogue-primary.timer && sudo systemctl start detect-rogue-primary.timer + ``` + - **After routine reboots**: Node automatically rejoins as standby, catches up via WAL streaming +4. 
**Service Status**: PostgreSQL and repmgrd services auto-start via systemd (enabled by default for routine maintenance) + +#### **Rolling Upgrades (Multiple Nodes)** +**Recommended Sequence for Major OS Updates**: +1. **Disable repmgrd and split-brain detection on all nodes**: + ```bash + sudo systemctl stop repmgrd@17-main && sudo systemctl disable repmgrd@17-main + sudo systemctl stop detect-rogue-primary.timer && sudo systemctl disable detect-rogue-primary.timer + ``` +2. Upgrade replica nodes first (postgresql2, postgresql3) +3. Manually rejoin each replica as standby after upgrade +4. Upgrade primary node last (postgresql1) - automatic failover will occur +5. Manually rejoin former primary as standby +6. **Re-enable all services on all nodes**: + ```bash + sudo systemctl enable repmgrd@17-main && sudo systemctl start repmgrd@17-main + sudo systemctl enable detect-rogue-primary.timer && sudo systemctl start detect-rogue-primary.timer + ``` +7. Monitor cluster status between each step: `sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf cluster show` + +#### **Manual Verification Steps** +After each node reboot, verify: ```bash -# Check replica status -sudo -u postgres psql -c "SELECT * FROM pg_stat_wal_receiver;" +# Check cluster status +sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf cluster show -# Verify hot standby mode -sudo -u postgres psql -c "SELECT pg_is_in_recovery();" +# Verify services are running +sudo systemctl status postgresql@17-main repmgrd@17-main detect-rogue-primary.timer -# Check replication lag -sudo -u postgres psql -c "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()));" +# Check replication status (on current primary) +sudo -u postgres psql -c "SELECT application_name, client_addr, state FROM pg_stat_replication;" ``` -### Performance Metrics # TODO +#### **Troubleshooting Failed Auto-Recovery** +If a node doesn't rejoin automatically after reboot: + +**For Major OS Updates (repmgrd and split-brain 
detection were disabled):** +1. **Start PostgreSQL service**: `sudo systemctl start postgresql@17-main` +2. **Manual rejoin as standby**: + ```bash + sudo systemctl unmask postgresql@17-main.service && sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf node rejoin -d repmgr -h <primary_ip> -U repmgr --verbose + ``` +3. **Re-enable all services**: + ```bash + sudo systemctl enable repmgrd@17-main && sudo systemctl start repmgrd@17-main + sudo systemctl enable detect-rogue-primary.timer && sudo systemctl start detect-rogue-primary.timer + ``` +4. **Check logs**: `sudo journalctl -u postgresql@17-main -u repmgrd@17-main --since "10m ago"` + +**For Routine Reboots (automatic recovery expected):** +1. **Check service status**: `sudo systemctl status postgresql@17-main repmgrd@17-main detect-rogue-primary.timer` +2. **Manual start if needed**: `sudo systemctl start postgresql@17-main repmgrd@17-main detect-rogue-primary.timer` +3. **Force rejoin if timeline diverged**: + ```bash + sudo -u postgres repmgr -f /etc/repmgr/17-main/repmgr.conf node rejoin -d repmgr -h <primary_ip> -U repmgr --force-rewind --verbose + ``` +4. **Check logs**: `sudo journalctl -u postgresql@17-main -u repmgrd@17-main --since "10m ago"` + +#### **Client Application Impact** +- **During failover**: Brief connection interruption (10-30s), applications should implement retry logic +- **Kubernetes environments**: postgres-endpoint-manager updates service endpoints within 2 minutes +- **Multiple primaries**: If multiple primaries are detected by the postgres-endpoint-manager, it will skip the endpoint update unless it gets resolved in the postgres cluster and will keep the last known good state. Check the cronjob pod logs for details. + +**Best Practice**: Schedule maintenance during low-traffic periods and monitor cluster health throughout the process. 
+ +**⚠️ Critical Note**: The split-brain detection timer (`detect-rogue-primary.timer`) runs independently of `repmgrd` and will continue to mask PostgreSQL services if it detects split-brain scenarios. Always disable it during major OS updates to prevent conflicts with manual cluster management. + +## Wire Server Database Setup -#### Key Performance Indicators -1. **Replication Lag**: Should be < 1MB under normal load -2. **Connection Count**: Monitor active connections vs. max_connections -3. **WAL Generation Rate**: Track WAL file creation frequency -4. **Disk Usage**: Monitor WAL directory and data directory sizes +The [`postgresql-wire-setup.yml`](../ansible/postgresql-playbooks/postgresql-wire-setup.yml) playbook creates the Wire server database and user account with **automatic Kubernetes secret management** - eliminating manual password handling. -#### Health Thresholds -- **Replication Lag**: Alert if > 5MB -- **Connection Usage**: Alert if > 80% of max_connections -- **Disk Usage**: Alert if WAL directory > 10% of total disk -- **Recovery Time**: Replica restart should complete within 2 minutes +### 🔐 Kubernetes Secret-Based Password Management -## Wire Server Database Setup +**How It Works:** +1. ✅ **Checks for existing K8s secret** `wire-postgresql-external-secret` in the cluster +2. ✅ **If exists**: Retrieves password from secret and uses it +3. ✅ **If not exists**: Generates strong 32-character random password and creates secret +4. ✅ **Creates/updates PostgreSQL user** with the password +5. ✅ **Stores credentials** in Kubernetes for wire-server to use -### PostgreSQL Wire Setup Playbook -The [`postgresql-wire-setup.yml`](../ansible/postgresql-playbooks/postgresql-wire-setup.yml) playbook is the final step in the PostgreSQL cluster deployment process. This playbook creates the dedicated database and user account required for Wire server operation. 
+### 📋 Running the Setup Playbook -#### Overview -This playbook runs exclusively on the primary PostgreSQL node (`postgresql_rw` group) and performs the following operations: +```bash +# Run the wire-server database setup +ansible-playbook ansible/postgresql-playbooks/postgresql-wire-setup.yml \ + -i ansible/inventory/offline/99-static +``` + +### 🔧 Using Password in Wire-Server Configuration + +The deployment pipeline automatically manages PostgreSQL password synchronization between the Kubernetes secret and wire-server configuration. + +#### **Automated Password Synchronization (CI/CD Pipeline)** -1. **Database Management**: - - Checks if the Wire server database `wire_dbname` already exists - - Creates the database if it doesn't exist +The CI/CD pipeline ([bin/offline-deploy.sh](../bin/offline-deploy.sh)) automatically handles password synchronization: -2. **User Account Management**: - - Verifies if the Wire server user account exists - - Creates a new user account if needed - - Generates a secure random password if `wire_pass` is not defined +1. **PostgreSQL Setup**: `postgresql-wire-setup.yml` creates/retrieves the K8s secret `wire-postgresql-external-secret` +2. **Password Sync**: `sync-k8s-secret-to-wire-secrets.sh` updates `values/wire-server/secrets.yaml` with the actual password +3. **Helm Deployment**: `offline-helm.sh` deploys wire-server using the updated `secrets.yaml` file -3. 
**Credential Management**: - - Displays generated credentials for the `wire_user` - - Ensures secure password generation (15 characters, alphanumeric) +**Key Script:** +- [`bin/sync-k8s-secret-to-wire-secrets.sh`](../bin/sync-k8s-secret-to-wire-secrets.sh) - Generic script to synchronize any K8s secret to YAML files -#### Usage -This playbook is automatically executed as part of the main `postgresql-deploy.yml` workflow, but can be run independently: +**Benefits:** +- ✅ No manual password management required +- ✅ Passwords are automatically generated (32-char random string) +- ✅ Source of truth is the Kubernetes secret +- ✅ Automatic backup before password updates +- ✅ Generic design supports any secret/YAML combination + +#### **Manual Password Synchronization** + +For manual deployments or troubleshooting, use the generic sync script within the Docker container of the adminhost: ```bash -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-playbooks/postgresql-wire-setup.yml +# Sync PostgreSQL password from K8s secret to secrets.yaml +./bin/sync-k8s-secret-to-wire-secrets.sh \ + wire-postgresql-external-secret \ + password \ + values/wire-server/secrets.yaml \ + .brig.secrets.pgPassword \ + .galley.secrets.pgPassword ``` -Alternatively, you can run just the wire setup from the main playbook using tags: +This script: +- Retrieves password from `wire-postgresql-external-secret` +- Updates multiple YAML paths in one command +- Creates a backup at `secrets.yaml.bak` +- Verifies all updates succeeded +- Works with any Kubernetes secret and YAML file + +#### **Alternative: Manual Password Override** + +For quick deployments or testing, override passwords during helm installation: ```bash -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --tags "wire-setup" +# Retrieve password from Kubernetes secret +PG_PASSWORD=$(kubectl get 
secret wire-postgresql-external-secret \ + -n default \ + -o jsonpath='{.data.password}' | base64 --decode) + +# Install/upgrade with password override +helm upgrade --install wire-server ./charts/wire-server \ + --namespace default \ + -f values/wire-server/values.yaml \ + -f values/wire-server/secrets.yaml \ + --set brig.secrets.pgPassword="${PG_PASSWORD}" \ + --set galley.secrets.pgPassword="${PG_PASSWORD}" ``` -To skip the wire setup when running the full deployment: +**Note:** For CI/CD deployments, the `sync-k8s-secret-to-wire-secrets.sh` script handles password synchronization automatically. + +#### **Password Verification** + +Verify password synchronization across all components: ```bash -ansible-playbook -i ansible/inventory/offline/hosts.ini ansible/postgresql-deploy.yml --skip-tags "wire-setup" +# Run the validation script +./bin/sync-k8s-secret-to-wire-server-values.sh ``` -#### Important Notes -- **Credential Security**: The generated password is displayed in the Ansible output. Ensure this output is securely stored and the password is updated in your Wire server configuration. +This checks: +- K8s secret `wire-postgresql-external-secret` exists and contains valid password +- Brig and Galley secrets in Kubernetes match the PostgreSQL password +- All components can connect to PostgreSQL -## Troubleshooting +--- -### Common Issues and Solutions +**🔐 Important Notes:** +- **Do NOT** manually set `wire_pass` in Ansible inventory - automatically managed via Kubernetes secrets +- **Source of Truth**: The Kubernetes secret `wire-postgresql-external-secret` is authoritative +- **Auto-Generated**: Passwords are randomly generated 32-character strings (high entropy) +- **Idempotent**: Running `sync-k8s-secret-to-wire-secrets.sh` multiple times is safe +- **CI/CD**: Password sync is automatic in offline deployment pipelines -#### 1. 
Replication Connection Issues -**Symptoms**: Replica cannot connect to primary -**Diagnosis**: -```bash -# Check network connectivity -telnet 5432 +## Kubernetes Integration -# Verify authentication -sudo -u postgres psql -h -U -d postgres -``` -**Solutions**: -- Verify `pg_hba.conf` entries for replication user -- Check firewall rules on primary node -- Validate replication user credentials - -#### 2. Replication Lag Issues -**Symptoms**: High replication lag or replicas falling behind -**Diagnosis**: -```sql --- Check WAL generation rate on primary -SELECT * FROM pg_stat_wal; - --- Monitor replication lag -SELECT * FROM pg_stat_replication; -``` -**Solutions**: -- Increase `wal_keep_size` on primary -- Check network bandwidth between nodes -- Optimize replica hardware resources - -#### 3. Wire Database Connection Issues -**Symptoms**: Wire server cannot connect to PostgreSQL database -**Diagnosis**: -```bash -# Test database connectivity -sudo -u postgres psql -d -U -h +This PostgreSQL HA cluster runs **independently outside Kubernetes** (on bare metal or VMs). 
For Kubernetes environments, the separate **postgres-endpoint-manager** component keeps PostgreSQL endpoints up to date: -# Check user privileges -sudo -u postgres psql -c "\du " -``` -**Solutions**: -- Verify database and user exist on primary node -- Check `pg_hba.conf` allows connections from Wire server hosts -- Validate credentials in Wire server configuration +- **Purpose**: Monitors PostgreSQL cluster state and updates Kubernetes service endpoints during failover +- **Repository**: [https://github.com/wireapp/postgres-endpoint-manager](https://github.com/wireapp/postgres-endpoint-manager) +- **Architecture**: Runs as a separate service that watches pg cluster events and updates Kubernetes services +- **Benefit**: Provides seamless failover transparency to containerized applications without cluster modification + +The PostgreSQL cluster operates independently, while the endpoint manager acts as an external observer that ensures Kubernetes applications always connect to the current primary node. 
diff --git a/values/wire-server/prod-secrets.example.yaml b/values/wire-server/prod-secrets.example.yaml index 04e44c932..c9872f7ed 100644 --- a/values/wire-server/prod-secrets.example.yaml +++ b/values/wire-server/prod-secrets.example.yaml @@ -1,7 +1,9 @@ # CHANGEME-PROD: All values here should be changed/reviewed brig: secrets: + # Retrieve the PostgreSQL password from the Kubernetes secret with "kubectl get secret wire-postgresql-external-secret -n default -o jsonpath='{.data.password}' | base64 -d" pgPassword: verysecurepassword + smtpPassword: dummyPassword zAuth: # generate zauth public/private keys with the 'zauth' executable from wire-server: @@ -44,6 +46,7 @@ cargohold: galley: secrets: + # Retrieve the PostgreSQL password from the Kubernetes secret with "kubectl get secret wire-postgresql-external-secret -n default -o jsonpath='{.data.password}' | base64 -d" pgPassword: verysecurepassword # these only need to be changed if using real AWS services awsKeyId: dummykey diff --git a/values/wire-server/prod-values.example.yaml b/values/wire-server/prod-values.example.yaml index e3d28a4ba..b9cae333e 100644 --- a/values/wire-server/prod-values.example.yaml +++ b/values/wire-server/prod-values.example.yaml @@ -127,7 +127,7 @@ cannon: # For demo mode only, we don't need to keep websocket connections open on chart upgrades drainTimeout: 10 config: - cassandra: + cassandra: host: cassandra-external metrics: serviceMonitor: