Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
68db610
add pg failover automation with repmgr
sghosh23 Aug 28, 2025
79eb0ec
Add a drop-IN to guard the priamry auto start
sghosh23 Aug 28, 2025
743a97d
add monitoring to detect split-brain and organize the plabooks
sghosh23 Aug 29, 2025
ab07fc3
Update postgresql configuration and documentation
sghosh23 Aug 29, 2025
e09ac6a
Update the doc
sghosh23 Sep 9, 2025
57964fb
Merge branch 'master' into wpb-19318-pg-ha
sghosh23 Sep 9, 2025
ee0a531
fix: typo on repmger.conf and update playbooks
sghosh23 Sep 12, 2025
9321edd
debug: test deployment
sghosh23 Sep 15, 2025
5e57636
skip demo and mini build for now
sghosh23 Sep 15, 2025
759a7cf
fix: set the right dns-resolver
sghosh23 Sep 16, 2025
00197ec
feat: Enhance PostgreSQL HA cluster with unified config and comprehen…
sghosh23 Sep 18, 2025
b519d48
Merge branch 'master' into wpb-19318-pg-ha
sghosh23 Sep 19, 2025
53f1002
docs: Remove duplicate content from PostgreSQL HA documentation
sghosh23 Sep 19, 2025
bc4b4c3
docs: Clarify Kubernetes integration architecture
sghosh23 Sep 19, 2025
86a6e60
Optimize the doc
sghosh23 Sep 19, 2025
10391bf
Optimize the doc to have a cleaner order of texts
sghosh23 Sep 19, 2025
0d6347c
Update postgres document with full command paths
sghosh23 Sep 22, 2025
e39dc15
fix the repmgr reconnect time and adjust doc
sghosh23 Sep 25, 2025
d69f358
update document
sghosh23 Sep 25, 2025
06ad1e7
add postrgresql-external values file for the CI
sghosh23 Sep 26, 2025
7885fe5
add demo values
sghosh23 Sep 26, 2025
3bef7d0
Merge branch 'master' into wpb-19318-pg-ha
sghosh23 Oct 2, 2025
c9fdf7d
Update with different cluster recovery scenario
sghosh23 Oct 7, 2025
8c5ec6a
add instructions regarding rogue-detector and unmasking the pg service
sghosh23 Oct 9, 2025
26a96d9
store the postgresql secret as k8s secret
sghosh23 Oct 14, 2025
5d4e12b
optimize the password management section
sghosh23 Oct 14, 2025
46fd118
sync k8s secrets
sghosh23 Oct 14, 2025
c3226c8
refactor the sync command
sghosh23 Oct 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions .github/workflows/offline.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
on:
push:
branches: [master, develop]
tags: [ v* ]
tags: [v*]
paths-ignore:
- '*.md'
- '**/*.md'
- "*.md"
- "**/*.md"
pull_request:
branches: [master, develop]
paths-ignore:
- '*.md'
- '**/*.md'
- "*.md"
- "**/*.md"
jobs:
# Build default profile and create local assets
build-default:
Expand Down Expand Up @@ -167,16 +167,16 @@ jobs:
- name: Process the demo profile build
run: ./offline/demo-build/build.sh
env:
GPG_PRIVATE_KEY: '${{ secrets.GPG_PRIVATE_KEY }}'
DOCKER_LOGIN: '${{ secrets.DOCKER_LOGIN }}'
GPG_PRIVATE_KEY: "${{ secrets.GPG_PRIVATE_KEY }}"
DOCKER_LOGIN: "${{ secrets.DOCKER_LOGIN }}"

- name: Copy demo build assets tarball to S3
run: |
aws s3 cp offline/demo-build/output/assets.tgz s3://public.wire.com/artifacts/wire-server-deploy-static-demo-${{ steps.upload_name.outputs.UPLOAD_NAME }}.tgz
echo "Uploaded to: https://s3-$AWS_REGION.amazonaws.com/public.wire.com/artifacts/wire-server-deploy-static-demo-${{ steps.upload_name.outputs.UPLOAD_NAME }}.tgz"
env:
AWS_ACCESS_KEY_ID: '${{ secrets.AWS_ACCESS_KEY_ID }}'
AWS_SECRET_ACCESS_KEY: '${{ secrets.AWS_SECRET_ACCESS_KEY }}'
AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
AWS_REGION: "eu-west-1"

- name: Cleanup demo build assets
Expand Down Expand Up @@ -208,16 +208,16 @@ jobs:
- name: Process the min profile build
run: ./offline/min-build/build.sh
env:
GPG_PRIVATE_KEY: '${{ secrets.GPG_PRIVATE_KEY }}'
DOCKER_LOGIN: '${{ secrets.DOCKER_LOGIN }}'
GPG_PRIVATE_KEY: "${{ secrets.GPG_PRIVATE_KEY }}"
DOCKER_LOGIN: "${{ secrets.DOCKER_LOGIN }}"

- name: Copy min build assets tarball to S3
run: |
aws s3 cp offline/min-build/output/assets.tgz s3://public.wire.com/artifacts/wire-server-deploy-static-min-${{ steps.upload_name.outputs.UPLOAD_NAME }}.tgz
echo "Uploaded to: https://s3-$AWS_REGION.amazonaws.com/public.wire.com/artifacts/wire-server-deploy-static-min-${{ steps.upload_name.outputs.UPLOAD_NAME }}.tgz"
env:
AWS_ACCESS_KEY_ID: '${{ secrets.AWS_ACCESS_KEY_ID }}'
AWS_SECRET_ACCESS_KEY: '${{ secrets.AWS_SECRET_ACCESS_KEY }}'
AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
AWS_REGION: "eu-west-1"

- name: Cleanup min build assets
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ values-init-done

# Envrc local overrides
.envrc.local

.vscode
# Nix-created result symlinks
result
result-*
Expand Down
51 changes: 47 additions & 4 deletions ansible/inventory/offline/group_vars/postgresql/postgresql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,44 @@ postgresql_version: 17
postgresql_data_dir: /var/lib/postgresql/{{ postgresql_version }}/main
postgresql_conf_dir: /etc/postgresql/{{ postgresql_version }}/main

# Replication services configuration
repsvc_user: repsvc
repsvc_password: "securepassword"
repsvc_database: repsvc_db
# repmgr HA configuration
repmgr_user: repmgr
repmgr_password: "securepassword"
repmgr_database: repmgr

# Node configuration for repmgr
repmgr_node_config:
postgresql1: # Maps to postgresql_rw group
node_id: 1
priority: 150
role: primary
postgresql2: # Maps to first postgresql_ro
node_id: 2
priority: 100
role: standby
postgresql3: # Maps to second postgresql_ro
node_id: 3
priority: 50
role: standby

# repmgr settings
# repmgrd monitoring and reconnection configuration
# Reference: https://repmgr.org/docs/current/repmgrd-basic-configuration.html
#
# monitor_interval_secs: Interval in seconds between monitoring checks
# - Default: 2 seconds
# - Controls how frequently repmgr monitors the primary server status
#
# reconnect_attempts: Maximum number of reconnection attempts
# - Default: 6 attempts
# - Number of times repmgr will attempt to reconnect to a failed primary
#
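# reconnect_interval: Interval in seconds between reconnection attempts
# - repmgr default: 10 seconds (overridden to 5 below)
# - Time to wait between each reconnection attempt; together with
#   reconnect_attempts: 6 this gives roughly 30 seconds of retries before
#   the primary is treated as failed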
monitor_interval_secs: 2
reconnect_attempts: 6
reconnect_interval: 5

# Use local packages instead of repository
postgresql_use_repository: false # false: install the local packages from the URLs in postgresql_pkgs; set to true to install from the package repository instead
Expand Down Expand Up @@ -35,3 +69,12 @@ postgresql_pkgs:
- name: python3-psycopg2
url: "{{ binaries_url }}/python3-psycopg2_2.9.10-1.pgdg22.04+1_amd64.deb"
checksum: "sha256:cc2f749e3af292a67e012edeb4aa5d284f57f2d66a9a09fe5b81e5ffda73cab4"
- name: repmgr-common
url: "{{ binaries_url }}/repmgr-common_5.5.0+debpgdg-1.pgdg22.04+1_all.deb"
checksum: "sha256:34c660c66a9710fd4f20a66cc932741d3399dbba7e7ae4b67468b3e18f65f61c"
- name: repmgr
url: "{{ binaries_url }}/repmgr_5.5.0+debpgdg-1.pgdg22.04+1_all.deb"
checksum: "sha256:20c280811e758106335df1eb9954b61aa552823d3129f1e38c488fbd5efe0567"
- name: postgresql-17-repmgr
url: "{{ binaries_url }}/postgresql-17-repmgr_5.5.0+debpgdg-1.pgdg22.04+1_amd64.deb"
checksum: "sha256:520d6ed4d540a2bb9174ac8276f8cb686c0268c13cccb89b28a9cdbd12049df8"
12 changes: 12 additions & 0 deletions ansible/postgresql-deploy.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
- name: Clean previous deployment state
import_playbook: postgresql-playbooks/clean_existing_setup.yml
tags:
- postgresql
- cleanup

- name: Install PostgreSQL packages
import_playbook: postgresql-playbooks/postgresql-install.yml
tags:
Expand Down Expand Up @@ -27,3 +33,9 @@
tags:
- postgresql
- wire-setup

- name: Deploy cluster monitoring
import_playbook: postgresql-playbooks/postgresql-monitoring.yml
tags:
- postgresql
- monitoring
173 changes: 173 additions & 0 deletions ansible/postgresql-playbooks/clean_existing_setup.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
---
# Resets the PostgreSQL/repmgr hosts to a known state before a (re)deployment.
#
# Flow:
#   1. Detect whether the host already carries a deployment (psql binary,
#      PG_VERSION in the data directory, repmgr.conf).
#   2. Fresh host: only make sure the directories used later exist.
#   3. Existing deployment: stop the monitoring/repmgrd services, drop the
#      repmgr database, stop PostgreSQL and remove repmgr configuration,
#      helper scripts, systemd units and sudoers snippets.
#   4. Always: reload systemd and print a summary.
#
# Variables read: target_nodes (optional host pattern), postgresql_version,
# repmgr_database.
- name: Clean previous deployment state
  hosts: "{{ target_nodes | default('postgresql_rw,postgresql_ro') }}"
  become: true
  tasks:
    # ===== DETECT INSTALLATION TYPE =====
    - name: Check if PostgreSQL is installed
      ansible.builtin.stat:
        path: "/usr/bin/psql"
      register: postgresql_installed

    - name: Check if PostgreSQL data directory exists
      ansible.builtin.stat:
        path: "/var/lib/postgresql/{{ postgresql_version }}/main/PG_VERSION"
      register: postgresql_data_exists

    - name: Check if repmgr configuration exists
      ansible.builtin.stat:
        path: "/etc/repmgr/{{ postgresql_version }}-main/repmgr.conf"
      register: repmgr_config_exists

    # A host counts as "fresh" as soon as any one of the three markers is
    # missing; only hosts with all three go through the full cleanup.
    - name: Determine if this is a fresh installation
      ansible.builtin.set_fact:
        is_fresh_install: >-
          {{
            not postgresql_installed.stat.exists or
            not postgresql_data_exists.stat.exists or
            not repmgr_config_exists.stat.exists
          }}

    - name: Display installation type
      ansible.builtin.debug:
        msg: |
          {{ inventory_hostname }}: {{ 'Fresh installation detected - skipping most cleanup tasks' if is_fresh_install else 'Existing deployment detected - performing full cleanup' }}

    # ===== FRESH INSTALLATION TASKS (MINIMAL) =====
    - name: Handle fresh installation
      when: is_fresh_install
      block:
        # Guarded on the psql binary because the directories are owned by
        # the postgres user.
        - name: Ensure basic directories exist for fresh install
          ansible.builtin.file:
            path: "{{ item }}"
            state: directory
            owner: postgres
            group: postgres
            mode: "0755"
          loop:
            - "/etc/repmgr/{{ postgresql_version }}-main"
            - "/opt/repmgr/scripts"
            - "/var/log/postgresql"
          when: postgresql_installed.stat.exists

        - name: Skip cleanup message for fresh install
          ansible.builtin.debug:
            msg: "Fresh installation - cleanup tasks skipped"

    # ===== EXISTING DEPLOYMENT CLEANUP =====
    # Every task here is best-effort (failed_when: false): a unit or file
    # that is already gone must not abort the redeployment.
    - name: Handle existing deployment cleanup
      when: not is_fresh_install
      block:
        # Called without "state" so the module only reports the unit status.
        - name: Check if PostgreSQL service exists
          ansible.builtin.systemd:
            name: "postgresql@{{ postgresql_version }}-main.service"
          register: postgresql_service_exists
          failed_when: false

        # The monitoring units and repmgrd are stopped BEFORE the repmgr
        # database is dropped: DROP DATABASE fails while another session is
        # connected to the target database, and that failure would be
        # swallowed by the best-effort error handling below.
        - name: Stop any existing split-brain monitoring timer
          ansible.builtin.systemd:
            name: detect-rogue-primary.timer
            state: stopped
          failed_when: false

        - name: Stop any existing split-brain monitoring service
          ansible.builtin.systemd:
            name: detect-rogue-primary.service
            state: stopped
          failed_when: false

        - name: Stop any existing repmgrd service
          ansible.builtin.systemd:
            name: "repmgrd@{{ postgresql_version }}-main.service"
            state: stopped
          failed_when: false

        # Prints the number of matching databases, or "0" when psql cannot
        # be run or cannot connect.
        - name: Check if repmgr database exists
          ansible.builtin.shell: |
            sudo -u postgres psql -t -A -c "SELECT COUNT(*) FROM pg_database WHERE datname = '{{ repmgr_database }}'" postgres 2>/dev/null || echo "0"
          register: repmgr_db_exists
          changed_when: false
          failed_when: false
          when:
            - postgresql_installed.stat.exists
            - postgresql_service_exists.status is defined
            - postgresql_service_exists.status.LoadState != "not-found"

        # default('0') covers the case where the check above was skipped and
        # therefore registered a result without stdout.
        - name: Drop repmgr database completely (if exists)
          ansible.builtin.shell: |
            sudo -u postgres psql -c "DROP DATABASE IF EXISTS {{ repmgr_database }};" postgres 2>/dev/null || true
          failed_when: false
          when:
            - postgresql_installed.stat.exists
            - repmgr_db_exists is defined
            - repmgr_db_exists.stdout | default('0') | trim != '0'

        - name: Unmask PostgreSQL services from previous deployments
          ansible.builtin.systemd:
            name: "postgresql@{{ postgresql_version }}-main.service"
            masked: false
          failed_when: false

        - name: Stop PostgreSQL service for clean state
          ansible.builtin.systemd:
            name: "postgresql@{{ postgresql_version }}-main.service"
            state: stopped
          failed_when: false

        - name: Remove repmgr configuration files, scripts, and systemd units
          ansible.builtin.file:
            path: "{{ item }}"
            state: absent
          failed_when: false
          loop:
            - "/etc/repmgr/{{ postgresql_version }}-main/repmgr.conf"
            - "/etc/repmgr/{{ postgresql_version }}"
            - "/etc/repmgr/{{ postgresql_version }}-main"
            - "/var/lib/postgresql/{{ postgresql_version }}/main/recovery.conf"
            - "/var/lib/postgresql/{{ postgresql_version }}/main/standby.signal"
            - "/opt/repmgr/scripts"
            - "/usr/local/bin/repmgr"
            - "/usr/local/bin/repmgrd"
            - "/usr/local/bin/detect_rogue_primary.sh"
            - "/etc/systemd/system/detect-rogue-primary.service"
            - "/etc/systemd/system/detect-rogue-primary.timer"
            # NOTE(review): the file name in the next entry looks like an
            # e-mail-obfuscation artefact ("[email protected]"); presumably
            # it was a systemd template unit name - restore the real path
            # from the repository before relying on this entry.
            - "/etc/systemd/system/[email protected]"
            - "/etc/systemd/system/repmgrd@{{ postgresql_version }}-main.service"
            - "/etc/systemd/system/repmgrd@{{ postgresql_version }}.service"
            - "/etc/sudoers.d/postgres-postgresql-management"
            - "/etc/sudoers.d/postgres-postgresql-service"

        # The glob also matches variants of the unit file that carry a
        # suffix after ".service".
        - name: Find rogue split-brain service files
          ansible.builtin.find:
            paths: /etc/systemd/system
            patterns: "detect-rogue-primary.service*"
          register: rogue_service_files

        - name: Remove rogue split-brain service files
          ansible.builtin.file:
            path: "{{ item.path }}"
            state: absent
          loop: "{{ rogue_service_files.files }}"
          when: rogue_service_files.matched > 0

    # ===== COMMON TASKS FOR ALL INSTALLATIONS =====
    # Makes systemd forget the unit files removed above.
    - name: Reload systemd daemon after cleanup
      ansible.builtin.systemd:
        daemon_reload: true
      failed_when: false

    - name: Display cleanup status
      ansible.builtin.debug:
        msg: |
          Cleanup completed for {{ inventory_hostname }}:
          - Installation type: {{ 'Fresh' if is_fresh_install else 'Existing' }}
          - PostgreSQL installed: {{ postgresql_installed.stat.exists }}
          - PostgreSQL data exists: {{ postgresql_data_exists.stat.exists }}
          - repmgr config exists: {{ repmgr_config_exists.stat.exists }}
          {% if is_fresh_install %}
          - Action taken: Minimal setup (directories created)
          {% else %}
          - Action taken: Full cleanup (services stopped, configs removed)
          {% endif %}
          - Ready for deployment: ✅
Loading