diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 08cbb37f67..04d2565173 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -15,6 +15,7 @@ on: - '**.md' - .github/renovate.json5 - 'docs/**' + - 'terraform/**' schedule: - cron: '53 0 * * *' # Daily at 00:53 UTC # Triggered on push to branch "main" by .github/workflows/release.yaml diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 4d2e5449b5..12867cd3ec 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -19,19 +19,24 @@ jobs: uses: actions/checkout@v5 - name: Set up environment run: | - sudo snap install charmcraft --classic + sudo snap install go --classic + go install github.com/snapcore/spread/cmd/spread@latest pipx install tox poetry - name: Collect spread jobs id: collect-jobs shell: python run: | import json + import pathlib import os import subprocess spread_jobs = ( subprocess.run( - ["charmcraft", "test", "--list", "github-ci"], capture_output=True, check=True, text=True + [pathlib.Path.home() / "go/bin/spread", "-list", "github-ci"], + capture_output=True, + check=True, + text=True, ) .stdout.strip() .split("\n") diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 31b98e7fc5..9a2f4b6c10 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -1,11 +1,11 @@ # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. -name: Release to Charmhub +name: Release to Charmhub edge on: push: branches: - - '16/edge' + - '*/edge' paths-ignore: - 'tests/**' - 'docs/**' @@ -16,6 +16,7 @@ on: - '.github/workflows/sync_docs.yaml' jobs: + ci-tests: name: Tests uses: ./.github/workflows/ci.yaml @@ -23,30 +24,13 @@ jobs: permissions: contents: write # Needed for Allure Report - release-libraries: - name: Release libraries - needs: - - ci-tests - runs-on: ubuntu-latest - timeout-minutes: 60 - steps: - - name: Checkout - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - name: Release charm libraries - uses: canonical/charming-actions/release-libraries@2.7.0 - with: - credentials: "${{ secrets.CHARMHUB_TOKEN }}" - github-token: "${{ secrets.GITHUB_TOKEN }}" - release: name: Release charm needs: - ci-tests uses: canonical/data-platform-workflows/.github/workflows/release_charm_edge.yaml@v35.0.2 with: - track: 16 + track: '16' artifact-prefix: ${{ needs.ci-tests.outputs.artifact-prefix }} secrets: charmhub-token: ${{ secrets.CHARMHUB_TOKEN }} diff --git a/actions.yaml b/actions.yaml index 84bbde6f6d..654eb85a1e 100644 --- a/actions.yaml +++ b/actions.yaml @@ -18,12 +18,43 @@ create-replication: type: string description: The name of the replication (defaults to 'default'). 
default: default +force-refresh-start: + description: | + Potential of data loss and downtime + + Force refresh of first unit + + Must run with at least one of the parameters `=false` + params: + check-compatibility: + type: boolean + default: true + description: | + Potential of data loss and downtime + + If `false`, force refresh if new version of PostgreSQL and/or charm is not compatible with previous version + run-pre-refresh-checks: + type: boolean + default: true + description: | + Potential of data loss and downtime + + If `false`, force refresh if app is unhealthy or not ready to refresh (and unit status shows "Pre-refresh check failed") + check-workload-container: + type: boolean + default: true + description: | + Potential of data loss and downtime during and after refresh + + If `false`, allow refresh to PostgreSQL container version that has not been validated to work with the charm revision + additionalProperties: false get-primary: description: Get the unit with is the primary/leader in the replication. list-backups: description: Lists backups in s3 storage in AWS. -pre-upgrade-check: - description: Run necessary pre-upgrade checks and preparations before executing a charm refresh. +pre-refresh-check: + description: Check if charm is ready to refresh + additionalProperties: false promote-to-primary: description: Promotes the cluster of choice to a primary cluster. Must be ran against the leader unit when promoting a cluster or against the unit to be promoted within the cluster. @@ -44,5 +75,27 @@ restore: restore-to-time: type: string description: Point-in-time-recovery target in PSQL format. +resume-refresh: + description: | + Refresh next unit(s) (after you have manually verified that refreshed units are healthy) + + If the `pause-after-unit-refresh` config is set to `all`, this action will refresh the next unit. + + If `pause-after-unit-refresh` is set to `first`, this action will refresh all remaining units. + Exception: if automatic health checks fail after a unit has refreshed, the refresh will pause. + + If `pause-after-unit-refresh` is set to `none`, this action will have no effect unless it is called with `check-health-of-refreshed-units` as `false`. + params: + check-health-of-refreshed-units: + type: boolean + default: true + description: | + Potential of data loss and downtime + + If `false`, force refresh (of next unit) if 1 or more refreshed units are unhealthy + + Warning: if first unit to refresh is unhealthy, consider running `force-refresh-start` action on that unit instead of using this parameter. + If first unit to refresh is unhealthy because compatibility checks, pre-refresh checks, or workload container checks are failing, this parameter is more destructive than the `force-refresh-start` action. + additionalProperties: false resume-upgrade: description: Resume a rolling upgrade after asserting successful upgrade of a new revision. diff --git a/charmcraft.yaml b/charmcraft.yaml index 14acff0663..b0e1d9ca48 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -86,6 +86,7 @@ parts: source: . stage: - LICENSE + - refresh_versions.toml - scripts - templates libpq: diff --git a/config.yaml b/config.yaml index a94e688d24..f015158001 100644 --- a/config.yaml +++ b/config.yaml @@ -390,6 +390,13 @@ options: Allowed values are: from 0 to 1.80E+308. 
type: float default: 0.1 + pause-after-unit-refresh: + description: | + Wait for manual confirmation to resume refresh after these units refresh + + Allowed values: "all", "first", "none" + type: string + default: first plugin_address_standardizer_data_us_enable: default: false type: boolean diff --git a/lib/charms/data_platform_libs/v0/upgrade.py b/lib/charms/data_platform_libs/v0/upgrade.py deleted file mode 100644 index f2ea143481..0000000000 --- a/lib/charms/data_platform_libs/v0/upgrade.py +++ /dev/null @@ -1,1109 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Library to manage in-place upgrades for charms running on VMs and K8s. - -This library contains handlers for `upgrade` relation events used to coordinate -between units in an application during a `juju refresh`, as well as `Pydantic` models -for instantiating, validating and comparing dependencies. - -An upgrade on VMs is initiated with the command `juju refresh`. Once executed, the following -events are emitted to each unit at random: - - `upgrade-charm` - - `config-changed` - - `leader-settings-changed` - Non-leader only - -Charm authors can implement the classes defined in this library to streamline the process of -coordinating which unit updates when, achieved through updating of unit-data `state` throughout. - -At a high-level, the upgrade steps are as follows: - - Run pre-checks on the cluster to confirm it is safe to upgrade - - Create stack of unit.ids, to serve as the upgrade order (generally workload leader is last) - - Start the upgrade by issuing a Juju CLI command - - The unit at the top of the stack gets permission to upgrade - - The unit handles the upgrade and restarts their service - - Repeat, until all units have restarted - -### Usage by charm authors - -#### `upgrade` relation - -Charm authors must implement an additional peer-relation. - -As this library uses relation data exchanged between units to coordinate, charm authors -need to add a new relation interface. The relation name does not matter. - -`metadata.yaml` -```yaml -peers: - upgrade: - interface: upgrade -``` - -#### Dependencies JSON/Dict - -Charm authors must implement a dict object tracking current charm versions, requirements + upgradability. - -Many workload versions may be incompatible with older/newer versions. This same idea also can apply to -charm or snap versions. Workloads with required related applications (e.g Kafka + ZooKeeper) also need to -ensure their versions are compatible during an upgrade, to avoid cluster failure. - -As such, it is necessasry to freeze any dependencies within each published charm. 
An example of this could -be creating a `DEPENDENCIES` dict within the charm code, with the following structure: - -`src/literals.py` -```python -DEPENDENCIES = { - "kafka_charm": { - "dependencies": {"zookeeper": ">50"}, - "name": "kafka", - "upgrade_supported": ">90", - "version": "100", - }, - "kafka_service": { - "dependencies": {"zookeeper": "^3"}, - "name": "kafka", - "upgrade_supported": ">=0.8", - "version": "3.3.2", - }, -} -``` - -The first-level key names are arbitrary labels for tracking what those versions+dependencies are for. -The `dependencies` second-level values are a key-value map of any required external applications, - and the versions this packaged charm can support. -The `upgrade_suppported` second-level values are requirements from which an in-place upgrade can be - supported by the charm. -The `version` second-level values correspond to the current version of this packaged charm. - -Any requirements comply with [`poetry`'s dependency specifications](https://python-poetry.org/docs/dependency-specification/#caret-requirements). - -### Dependency Model - -Charm authors must implement their own class inheriting from `DependencyModel`. - -Using a `Pydantic` model to instantiate the aforementioned `DEPENDENCIES` dict gives stronger type safety and additional -layers of validation. - -Implementation just needs to ensure that the top-level key names from `DEPENDENCIES` are defined as attributed in the model. - -`src/upgrade.py` -```python -from pydantic import BaseModel - -class KafkaDependenciesModel(BaseModel): - kafka_charm: DependencyModel - kafka_service: DependencyModel -``` - -### Overrides for `DataUpgrade` - -Charm authors must define their own class, inheriting from `DataUpgrade`, overriding all required `abstractmethod`s. - -```python -class ZooKeeperUpgrade(DataUpgrade): - def __init__(self, charm: "ZooKeeperUpgrade", **kwargs): - super().__init__(charm, **kwargs) - self.charm = charm -``` - -#### Implementation of `pre_upgrade_check()` - -Before upgrading a cluster, it's a good idea to check that it is stable and healthy before permitting it. -Here, charm authors can validate upgrade safety through API calls, relation-data checks, etc. -If any of these checks fail, raise `ClusterNotReadyError`. - -```python - @override - def pre_upgrade_check(self) -> None: - default_message = "Pre-upgrade check failed and cannot safely upgrade" - try: - if not self.client.members_broadcasting or not len(self.client.server_members) == len( - self.charm.cluster.peer_units - ): - raise ClusterNotReadyError( - message=default_message, - cause="Not all application units are connected and broadcasting in the quorum", - ) - - if self.client.members_syncing: - raise ClusterNotReadyError( - message=default_message, cause="Some quorum members are syncing data" - ) - - if not self.charm.cluster.stable: - raise ClusterNotReadyError( - message=default_message, cause="Charm has not finished initialising" - ) - - except QuorumLeaderNotFoundError: - raise ClusterNotReadyError(message=default_message, cause="Quorum leader not found") - except ConnectionClosedError: - raise ClusterNotReadyError( - message=default_message, cause="Unable to connect to the cluster" - ) -``` - -#### Implementation of `build_upgrade_stack()` - VM ONLY - -Oftentimes, it is necessary to ensure that the workload leader is the last unit to upgrade, -to ensure high-availability during the upgrade process. 
-Here, charm authors can create a LIFO stack of unit.ids, represented as a list of unit.id strings, -with the leader unit being at i[0]. - -```python -@override -def build_upgrade_stack(self) -> list[int]: - upgrade_stack = [] - for unit in self.charm.cluster.peer_units: - config = self.charm.cluster.unit_config(unit=unit) - - # upgrade quorum leader last - if config["host"] == self.client.leader: - upgrade_stack.insert(0, int(config["unit_id"])) - else: - upgrade_stack.append(int(config["unit_id"])) - - return upgrade_stack -``` - -#### Implementation of `_on_upgrade_granted()` - -On relation-changed events, each unit will check the current upgrade-stack persisted to relation data. -If that unit is at the top of the stack, it will emit an `upgrade-granted` event, which must be handled. -Here, workloads can be re-installed with new versions, checks can be made, data synced etc. -If the new unit successfully rejoined the cluster, call `set_unit_completed()`. -If the new unit failed to rejoin the cluster, call `set_unit_failed()`. - -NOTE - It is essential here to manually call `on_upgrade_changed` if the unit is the current leader. -This ensures that the leader gets it's own relation-changed event, and updates the upgrade-stack for -other units to follow suit. - -```python -@override -def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: - self.charm.snap.stop_snap_service() - - if not self.charm.snap.install(): - logger.error("Unable to install ZooKeeper Snap") - self.set_unit_failed() - return None - - logger.info(f"{self.charm.unit.name} upgrading service...") - self.charm.snap.restart_snap_service() - - try: - logger.debug("Running post-upgrade check...") - self.pre_upgrade_check() - - logger.debug("Marking unit completed...") - self.set_unit_completed() - - # ensures leader gets it's own relation-changed when it upgrades - if self.charm.unit.is_leader(): - logger.debug("Re-emitting upgrade-changed on leader...") - self.on_upgrade_changed(event) - - except ClusterNotReadyError as e: - logger.error(e.cause) - self.set_unit_failed() -``` - -#### Implementation of `log_rollback_instructions()` - -If the upgrade fails, manual intervention may be required for cluster recovery. -Here, charm authors can log out any necessary steps to take to recover from a failed upgrade. -When a unit fails, this library will automatically log out this message. - -```python -@override -def log_rollback_instructions(self) -> None: - logger.error("Upgrade failed. Please run `juju refresh` to previous version.") -``` - -### Instantiating in the charm and deferring events - -Charm authors must add a class attribute for the child class of `DataUpgrade` in the main charm. -They must also ensure that any non-upgrade related events that may be unsafe to handle during -an upgrade, are deferred if the unit is not in the `idle` state - i.e not currently upgrading. 
- -```python -class ZooKeeperCharm(CharmBase): - def __init__(self, *args): - super().__init__(*args) - self.upgrade = ZooKeeperUpgrade( - self, - relation_name = "upgrade", - substrate = "vm", - dependency_model=ZooKeeperDependencyModel( - **DEPENDENCIES - ), - ) - - def restart(self, event) -> None: - if not self.upgrade.state == "idle": - event.defer() - return None - - self.restart_snap_service() -``` -""" - -import json -import logging -from abc import ABC, abstractmethod -from typing import Dict, List, Literal, Optional, Set, Tuple - -import poetry.core.constraints.version as poetry_version -from ops.charm import ( - ActionEvent, - CharmBase, - CharmEvents, - RelationCreatedEvent, - UpgradeCharmEvent, -) -from ops.framework import EventBase, EventSource, Object -from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, Relation, Unit, WaitingStatus - -try: - from pydantic.v1 import BaseModel, root_validator, validator -except ModuleNotFoundError: - from pydantic import BaseModel, root_validator, validator - -# The unique Charmhub library identifier, never change it -LIBID = "156258aefb79435a93d933409a8c8684" - -# Increment this major API version when introducing breaking changes -LIBAPI = 0 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 19 - -PYDEPS = ["pydantic>=1.10,<2", "poetry-core"] - -logger = logging.getLogger(__name__) - -# --- DEPENDENCY RESOLUTION FUNCTIONS --- - - -def verify_requirements(version: str, requirement: str) -> bool: - """Verifies a specified version against defined constraint. - - Supports Poetry version constraints - https://python-poetry.org/docs/dependency-specification/#version-constraints - - Args: - version: the version currently in use - requirement: Poetry version constraint - - Returns: - True if `version` meets defined `requirement`. Otherwise False - """ - return poetry_version.parse_constraint(requirement).allows( - poetry_version.Version.parse(version) - ) - - -# --- DEPENDENCY MODEL TYPES --- - - -class DependencyModel(BaseModel): - """Manager for a single dependency. - - To be used as part of another model representing a collection of arbitrary dependencies. 
- - Example:: - - class KafkaDependenciesModel(BaseModel): - kafka_charm: DependencyModel - kafka_service: DependencyModel - - deps = { - "kafka_charm": { - "dependencies": {"zookeeper": ">5"}, - "name": "kafka", - "upgrade_supported": ">5", - "version": "10", - }, - "kafka_service": { - "dependencies": {"zookeeper": "^3.6"}, - "name": "kafka", - "upgrade_supported": "~3.3", - "version": "3.3.2", - }, - } - - model = KafkaDependenciesModel(**deps) # loading dict in to model - - print(model.dict()) # exporting back validated deps - """ - - dependencies: Dict[str, str] - name: str - upgrade_supported: str - version: str - - @validator("dependencies", "upgrade_supported", each_item=True) - @classmethod - def dependencies_validator(cls, value): - """Validates version constraint.""" - if isinstance(value, dict): - deps = value.values() - else: - deps = [value] - - for dep in deps: - poetry_version.parse_constraint(dep) - - return value - - @root_validator(skip_on_failure=True) - @classmethod - def version_upgrade_supported_validator(cls, values): - """Validates specified `version` meets `upgrade_supported` requirement.""" - if not verify_requirements( - version=values.get("version"), requirement=values.get("upgrade_supported") - ): - raise ValueError( - f"upgrade_supported value {values.get('upgrade_supported')} greater than version value {values.get('version')} for {values.get('name')}." - ) - - return values - - def can_upgrade(self, dependency: "DependencyModel") -> bool: - """Compares two instances of :class:`DependencyModel` for upgradability. - - Args: - dependency: a dependency model to compare this model against - - Returns: - True if current model can upgrade from dependent model. Otherwise False - """ - return verify_requirements(version=self.version, requirement=dependency.upgrade_supported) - - -# --- CUSTOM EXCEPTIONS --- - - -class UpgradeError(Exception): - """Base class for upgrade related exceptions in the module.""" - - def __init__(self, message: str, cause: Optional[str], resolution: Optional[str]): - super().__init__(message) - self.message = message - self.cause = cause or "" - self.resolution = resolution or "" - - def __repr__(self): - """Representation of the UpgradeError class.""" - return f"{type(self).__module__}.{type(self).__name__} - {str(vars(self))}" - - def __str__(self): - """String representation of the UpgradeError class.""" - return repr(self) - - -class ClusterNotReadyError(UpgradeError): - """Exception flagging that the cluster is not ready to start upgrading. - - For example, if the cluster fails :class:`DataUpgrade._on_pre_upgrade_check_action` - - Args: - message: string message to be logged out - cause: short human-readable description of the cause of the error - resolution: short human-readable instructions for manual error resolution (optional) - """ - - def __init__(self, message: str, cause: str, resolution: Optional[str] = None): - super().__init__(message, cause=cause, resolution=resolution) - - -class KubernetesClientError(UpgradeError): - """Exception flagging that a call to Kubernetes API failed. 
- - For example, if the cluster fails :class:`DataUpgrade._set_rolling_update_partition` - - Args: - message: string message to be logged out - cause: short human-readable description of the cause of the error - resolution: short human-readable instructions for manual error resolution (optional) - """ - - def __init__(self, message: str, cause: str, resolution: Optional[str] = None): - super().__init__(message, cause=cause, resolution=resolution) - - -class VersionError(UpgradeError): - """Exception flagging that the old `version` fails to meet the new `upgrade_supported`s. - - For example, upgrades from version `2.x` --> `4.x`, - but `4.x` only supports upgrading from `3.x` onwards - - Args: - message: string message to be logged out - cause: short human-readable description of the cause of the error - resolution: short human-readable instructions for manual solutions to the error (optional) - """ - - def __init__(self, message: str, cause: str, resolution: Optional[str] = None): - super().__init__(message, cause=cause, resolution=resolution) - - -class DependencyError(UpgradeError): - """Exception flagging that some new `dependency` is not being met. - - For example, new version requires related App version `2.x`, but currently is `1.x` - - Args: - message: string message to be logged out - cause: short human-readable description of the cause of the error - resolution: short human-readable instructions for manual solutions to the error (optional) - """ - - def __init__(self, message: str, cause: str, resolution: Optional[str] = None): - super().__init__(message, cause=cause, resolution=resolution) - - -# --- CUSTOM EVENTS --- - - -class UpgradeGrantedEvent(EventBase): - """Used to tell units that they can process an upgrade.""" - - -class UpgradeFinishedEvent(EventBase): - """Used to tell units that they finished the upgrade.""" - - -class UpgradeEvents(CharmEvents): - """Upgrade events. - - This class defines the events that the lib can emit. 
- """ - - upgrade_granted = EventSource(UpgradeGrantedEvent) - upgrade_finished = EventSource(UpgradeFinishedEvent) - - -# --- EVENT HANDLER --- - - -class DataUpgrade(Object, ABC): - """Manages `upgrade` relation operations for in-place upgrades.""" - - STATES = ["recovery", "failed", "idle", "ready", "upgrading", "completed"] - - on = UpgradeEvents() # pyright: ignore [reportAssignmentType] - - def __init__( - self, - charm: CharmBase, - dependency_model: BaseModel, - relation_name: str = "upgrade", - substrate: Literal["vm", "k8s"] = "vm", - ): - super().__init__(charm, relation_name) - self.charm = charm - self.dependency_model = dependency_model - self.relation_name = relation_name - self.substrate = substrate - self._upgrade_stack = None - - # events - self.framework.observe( - self.charm.on[relation_name].relation_created, self._on_upgrade_created - ) - self.framework.observe( - self.charm.on[relation_name].relation_changed, self.on_upgrade_changed - ) - self.framework.observe(self.charm.on.upgrade_charm, self._on_upgrade_charm) - self.framework.observe(getattr(self.on, "upgrade_granted"), self._on_upgrade_granted) - self.framework.observe(getattr(self.on, "upgrade_finished"), self._on_upgrade_finished) - - # actions - self.framework.observe( - getattr(self.charm.on, "pre_upgrade_check_action"), self._on_pre_upgrade_check_action - ) - if self.substrate == "k8s": - self.framework.observe( - getattr(self.charm.on, "resume_upgrade_action"), self._on_resume_upgrade_action - ) - - @property - def peer_relation(self) -> Optional[Relation]: - """The upgrade peer relation.""" - return self.charm.model.get_relation(self.relation_name) - - @property - def app_units(self) -> Set[Unit]: - """The peer-related units in the application.""" - if not self.peer_relation: - return set() - - return set([self.charm.unit] + list(self.peer_relation.units)) - - @property - def state(self) -> Optional[str]: - """The unit state from the upgrade peer relation.""" - if not self.peer_relation: - return None - - return self.peer_relation.data[self.charm.unit].get("state", None) - - @property - def stored_dependencies(self) -> Optional[BaseModel]: - """The application dependencies from the upgrade peer relation.""" - if not self.peer_relation: - return None - - if not (deps := self.peer_relation.data[self.charm.app].get("dependencies", "")): - return None - - return type(self.dependency_model)(**json.loads(deps)) - - @property - def upgrade_stack(self) -> Optional[List[int]]: - """Gets the upgrade stack from the upgrade peer relation. - - Unit.ids are ordered Last-In-First-Out (LIFO). - i.e unit.id at index `-1` is the first unit to upgrade. - unit.id at index `0` is the last unit to upgrade. - - Returns: - List of integer unit.ids, ordered in upgrade order in a stack - """ - if not self.peer_relation: - return None - - # lazy-load - if self._upgrade_stack is None: - self._upgrade_stack = ( - json.loads(self.peer_relation.data[self.charm.app].get("upgrade-stack", "[]")) - or None - ) - - return self._upgrade_stack - - @upgrade_stack.setter - def upgrade_stack(self, stack: List[int]) -> None: - """Sets the upgrade stack to the upgrade peer relation. - - Unit.ids are ordered Last-In-First-Out (LIFO). - i.e unit.id at index `-1` is the first unit to upgrade. - unit.id at index `0` is the last unit to upgrade. 
- """ - if not self.peer_relation: - return - - self.peer_relation.data[self.charm.app].update({"upgrade-stack": json.dumps(stack)}) - self._upgrade_stack = stack - - @property - def other_unit_states(self) -> list: - """Current upgrade state for other units. - - Returns: - Unsorted list of upgrade states for other units. - """ - if not self.peer_relation: - return [] - - return [ - self.peer_relation.data[unit].get("state", "") - for unit in list(self.peer_relation.units) - ] - - @property - def unit_states(self) -> list: - """Current upgrade state for all units. - - Returns: - Unsorted list of upgrade states for all units. - """ - if not self.peer_relation: - return [] - - return [self.peer_relation.data[unit].get("state", "") for unit in self.app_units] - - @property - def cluster_state(self) -> Optional[str]: - """Current upgrade state for cluster units. - - Determined from :class:`DataUpgrade.STATE`, taking the lowest ordinal unit state. - - For example, if units in have states: `["ready", "upgrading", "completed"]`, - the overall state for the cluster is `ready`. - - Returns: - String of upgrade state from the furthest behind unit. - """ - if not self.unit_states: - return None - - try: - return sorted(self.unit_states, key=self.STATES.index)[0] - except (ValueError, KeyError): - return None - - @property - def idle(self) -> Optional[bool]: - """Flag for whether the cluster is in an idle upgrade state. - - Returns: - True if all application units in idle state. Otherwise False - """ - return set(self.unit_states) == {"idle"} - - @abstractmethod - def pre_upgrade_check(self) -> None: - """Runs necessary checks validating the cluster is in a healthy state to upgrade. - - Called by all units during :meth:`_on_pre_upgrade_check_action`. - - Raises: - :class:`ClusterNotReadyError`: if cluster is not ready to upgrade - """ - pass - - def build_upgrade_stack(self) -> List[int]: - """Builds ordered iterable of all application unit.ids to upgrade in. - - Called by leader unit during :meth:`_on_pre_upgrade_check_action`. - - Returns: - Iterable of integer unit.ids, LIFO ordered in upgrade order - i.e `[5, 2, 4, 1, 3]`, unit `3` upgrades first, `5` upgrades last - """ - # don't raise if k8s substrate, uses default statefulset order - if self.substrate == "k8s": - return [] - - raise NotImplementedError - - @abstractmethod - def log_rollback_instructions(self) -> None: - """Sets charm state and logs out rollback instructions. - - Called by all units when `state=failed` found during :meth:`_on_upgrade_changed`. 
- """ - pass - - def _repair_upgrade_stack(self) -> None: - """Ensures completed units are re-added to the upgrade-stack after failure.""" - # need to update the stack as it was not refreshed by rollback run of pre-upgrade-check - # avoids difficult health check implementation by charm-authors needing to exclude dead units - - # if the first unit in the stack fails, the stack will be the same length as units - # i.e this block not ran - if ( - self.cluster_state in ["failed", "recovery"] - and self.upgrade_stack - and len(self.upgrade_stack) != len(self.app_units) - and self.charm.unit.is_leader() - ): - new_stack = self.upgrade_stack - for unit in self.app_units: - unit_id = int(unit.name.split("/")[1]) - - # if a unit fails, it rolls back first - if unit_id not in new_stack: - new_stack.insert(-1, unit_id) - logger.debug(f"Inserted {unit_id} in to upgrade-stack - {new_stack}") - - self.upgrade_stack = new_stack - - def set_unit_failed(self, cause: Optional[str] = None) -> None: - """Sets unit `state=failed` to the upgrade peer data. - - Args: - cause: short description of cause of failure - """ - if not self.peer_relation: - return None - - # needed to refresh the stack - # now leader pulls a fresh stack from newly updated relation data - if self.charm.unit.is_leader(): - self._upgrade_stack = None - - self.charm.unit.status = BlockedStatus(cause if cause else "") - self.peer_relation.data[self.charm.unit].update({"state": "failed"}) - self.log_rollback_instructions() - - def set_unit_completed(self) -> None: - """Sets unit `state=completed` to the upgrade peer data.""" - if not self.peer_relation: - return None - - # needed to refresh the stack - # now leader pulls a fresh stack from newly updated relation data - if self.charm.unit.is_leader(): - self._upgrade_stack = None - - self.charm.unit.status = MaintenanceStatus("upgrade completed") - self.peer_relation.data[self.charm.unit].update({"state": "completed"}) - - # Emit upgrade_finished event to run unit's post upgrade operations. - if self.substrate == "k8s": - logger.debug( - f"{self.charm.unit.name} has completed the upgrade, emitting `upgrade_finished` event..." 
- ) - getattr(self.on, "upgrade_finished").emit() - - def _on_upgrade_created(self, event: RelationCreatedEvent) -> None: - """Handler for `upgrade-relation-created` events.""" - if not self.peer_relation: - event.defer() - return - - # setting initial idle state needed to avoid execution on upgrade-changed events - self.peer_relation.data[self.charm.unit].update({"state": "idle"}) - - if self.charm.unit.is_leader(): - logger.debug("Persisting dependencies to upgrade relation data...") - self.peer_relation.data[self.charm.app].update({ - "dependencies": json.dumps(self.dependency_model.dict()) - }) - - def _on_pre_upgrade_check_action(self, event: ActionEvent) -> None: - """Handler for `pre-upgrade-check-action` events.""" - if not self.peer_relation: - event.fail(message="Could not find upgrade relation.") - return - - if not self.charm.unit.is_leader(): - event.fail(message="Action must be ran on the Juju leader.") - return - - if self.cluster_state == "failed": - logger.info("Entering recovery state for rolling-back to previous version...") - self._repair_upgrade_stack() - self.charm.unit.status = BlockedStatus("ready to rollback application") - self.peer_relation.data[self.charm.unit].update({"state": "recovery"}) - return - - # checking if upgrade in progress - if self.cluster_state != "idle": - event.fail("Cannot run pre-upgrade checks, cluster already upgrading.") - return - - try: - logger.info("Running pre-upgrade-check...") - self.pre_upgrade_check() - - if self.substrate == "k8s": - logger.info("Building upgrade-stack for K8s...") - built_upgrade_stack = sorted([ - int(unit.name.split("/")[1]) for unit in self.app_units - ]) - else: - logger.info("Building upgrade-stack for VMs...") - built_upgrade_stack = self.build_upgrade_stack() - - logger.debug(f"Built upgrade stack of {built_upgrade_stack}") - - except ClusterNotReadyError as e: - logger.error(e) - event.fail(message=e.message) - return - except Exception as e: - logger.error(e) - event.fail(message="Unknown error found.") - return - - logger.info("Setting upgrade-stack to relation data...") - self.upgrade_stack = built_upgrade_stack - - def _on_resume_upgrade_action(self, event: ActionEvent) -> None: - """Handle resume upgrade action. - - Continue the upgrade by setting the partition to the next unit. - """ - if not self.peer_relation: - event.fail(message="Could not find upgrade relation.") - return - - if not self.charm.unit.is_leader(): - event.fail(message="Action must be ran on the Juju leader.") - return - - if not self.upgrade_stack: - event.fail(message="Nothing to resume, upgrade stack unset.") - return - - # Check whether this is being run after juju refresh was called - # (the size of the upgrade stack should match the number of total - # unit minus one). - if len(self.upgrade_stack) != len(self.peer_relation.units): - event.fail(message="Upgrade can be resumed only once after juju refresh is called.") - return - - try: - next_partition = self.upgrade_stack[-1] - self._set_rolling_update_partition(partition=next_partition) - event.set_results({"message": f"Upgrade will resume on unit {next_partition}"}) - except KubernetesClientError: - event.fail(message="Cannot set rolling update partition.") - - def _upgrade_supported_check(self) -> None: - """Checks if previous versions can be upgraded to new versions. 
- - Raises: - :class:`VersionError` if upgrading to existing `version` is not supported - """ - keys = self.dependency_model.__fields__.keys() - - compatible = True - incompatibilities: List[Tuple[str, str, str, str]] = [] - for key in keys: - old_dep: DependencyModel = getattr(self.stored_dependencies, key) - new_dep: DependencyModel = getattr(self.dependency_model, key) - - if not old_dep.can_upgrade(dependency=new_dep): - compatible = False - incompatibilities.append(( - key, - old_dep.version, - new_dep.version, - new_dep.upgrade_supported, - )) - - base_message = "Versions incompatible" - base_cause = "Upgrades only supported for specific versions" - if not compatible: - for incompat in incompatibilities: - base_message += ( - f", {incompat[0]} {incompat[1]} can not be upgraded to {incompat[2]}" - ) - base_cause += f", {incompat[0]} versions satisfying requirement {incompat[3]}" - - raise VersionError( - message=base_message, - cause=base_cause, - ) - - def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None: - """Handler for `upgrade-charm` events.""" - # defer if not all units have pre-upgraded - if not self.peer_relation: - event.defer() - return - - if not self.upgrade_stack: - logger.error("Cluster upgrade failed, ensure pre-upgrade checks are ran first.") - return - - if self.substrate == "vm": - # for VM run version checks on leader only - if self.charm.unit.is_leader(): - try: - self._upgrade_supported_check() - except VersionError as e: # not ready if not passed check - logger.error(e) - self.set_unit_failed() - return - top_unit_id = self.upgrade_stack[-1] - top_unit = self.charm.model.get_unit(f"{self.charm.app.name}/{top_unit_id}") - if ( - top_unit == self.charm.unit - and self.peer_relation.data[self.charm.unit].get("state") == "recovery" - ): - # While in a rollback and the Juju leader unit is the top unit in the upgrade stack, emit the event - # for this unit to start the rollback. - self.peer_relation.data[self.charm.unit].update({"state": "ready"}) - self.on_upgrade_changed(event) - return - self.charm.unit.status = WaitingStatus("other units upgrading first...") - self.peer_relation.data[self.charm.unit].update({"state": "ready"}) - - if len(self.app_units) == 1: - # single unit upgrade, emit upgrade_granted event right away - getattr(self.on, "upgrade_granted").emit() - - else: - # for k8s run version checks only on highest ordinal unit - if ( - self.charm.unit.name - == f"{self.charm.app.name}/{self.charm.app.planned_units() - 1}" - ): - try: - self._upgrade_supported_check() - except VersionError as e: # not ready if not passed check - logger.error(e) - self.set_unit_failed() - return - # On K8s an unit that receives the upgrade-charm event is upgrading - self.charm.unit.status = MaintenanceStatus("upgrading unit") - self.peer_relation.data[self.charm.unit].update({"state": "upgrading"}) - - def on_upgrade_changed(self, event: EventBase) -> None: - """Handler for `upgrade-relation-changed` events.""" - if not self.peer_relation: - return - - # if any other unit failed, don't continue with upgrade - if self.cluster_state == "failed": - logger.debug("Cluster failed to upgrade, exiting...") - return - - if self.substrate == "vm" and self.cluster_state == "recovery": - # skip run while in recovery. 
The event will be retrigged when the cluster is ready - logger.debug("Cluster in recovery, skip...") - return - - # if all units completed, mark as complete - if not self.upgrade_stack: - if self.state == "completed" and self.cluster_state in ["idle", "completed"]: - logger.info("All units completed upgrade, setting idle upgrade state...") - self.charm.unit.status = ActiveStatus() - self.peer_relation.data[self.charm.unit].update({"state": "idle"}) - - if self.charm.unit.is_leader(): - logger.debug("Persisting new dependencies to upgrade relation data...") - self.peer_relation.data[self.charm.app].update({ - "dependencies": json.dumps(self.dependency_model.dict()) - }) - return - - if self.cluster_state == "idle": - logger.debug("upgrade-changed event handled before pre-checks, exiting...") - return - - logger.debug("Did not find upgrade-stack or completed cluster state, skipping...") - return - - # upgrade ongoing, set status for waiting units - if "upgrading" in self.unit_states and self.state in ["idle", "ready"]: - self.charm.unit.status = WaitingStatus("other units upgrading first...") - - # pop mutates the `upgrade_stack` attr - top_unit_id = self.upgrade_stack.pop() - top_unit = self.charm.model.get_unit(f"{self.charm.app.name}/{top_unit_id}") - top_state = self.peer_relation.data[top_unit].get("state") - - # if top of stack is completed, leader pops it - if self.charm.unit.is_leader() and top_state == "completed": - logger.debug(f"{top_unit} has finished upgrading, updating stack...") - - # writes the mutated attr back to rel data - self.peer_relation.data[self.charm.app].update({ - "upgrade-stack": json.dumps(self.upgrade_stack) - }) - - # recurse on leader to ensure relation changed event not lost - # in case leader is next or the last unit to complete - self.on_upgrade_changed(event) - - # if unit top of stack and all units ready (i.e stack), emit granted event - if ( - self.charm.unit == top_unit - and top_state in ["ready", "upgrading"] - and self.cluster_state == "ready" - and "upgrading" not in self.other_unit_states - ): - logger.debug( - f"{top_unit.name} is next to upgrade, emitting `upgrade_granted` event and upgrading..." - ) - self.charm.unit.status = MaintenanceStatus("upgrading...") - self.peer_relation.data[self.charm.unit].update({"state": "upgrading"}) - - try: - getattr(self.on, "upgrade_granted").emit() - except DependencyError as e: - logger.error(e) - self.set_unit_failed() - return - - def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: - """Handler for `upgrade-granted` events. - - Handlers of this event must meet the following: - - SHOULD check for related application deps from :class:`DataUpgrade.dependencies` - - MAY raise :class:`DependencyError` if dependency not met - - MUST update unit `state` after validating the success of the upgrade, calling one of: - - :class:`DataUpgrade.set_unit_failed` if the unit upgrade fails - - :class:`DataUpgrade.set_unit_completed` if the unit upgrade succeeds - - MUST call :class:`DataUpgarde.on_upgrade_changed` on exit so event not lost on leader - """ - # don't raise if k8s substrate, only return - if self.substrate == "k8s": - return - - raise NotImplementedError - - def _on_upgrade_finished(self, _) -> None: - """Handler for `upgrade-finished` events.""" - if self.substrate == "vm" or not self.peer_relation: - return - - # Emit the upgrade relation changed event in the leader to update the upgrade_stack. 
- if self.charm.unit.is_leader(): - self.charm.on[self.relation_name].relation_changed.emit( - self.model.get_relation(self.relation_name) - ) - - # This hook shouldn't run for the last unit (the first that is upgraded). For that unit it - # should be done through an action after the upgrade success on that unit is double-checked. - unit_number = int(self.charm.unit.name.split("/")[1]) - if unit_number == len(self.peer_relation.units): - logger.info( - f"{self.charm.unit.name} unit upgraded. Evaluate and run `resume-upgrade` action to continue upgrade" - ) - return - - # Also, the hook shouldn't run for the first unit (the last that is upgraded). - if unit_number == 0: - logger.info(f"{self.charm.unit.name} unit upgraded. Upgrade is complete") - return - - try: - # Use the unit number instead of the upgrade stack to avoid race conditions - # (i.e. the leader updates the upgrade stack after this hook runs). - next_partition = unit_number - 1 - logger.debug(f"Set rolling update partition to unit {next_partition}") - self._set_rolling_update_partition(partition=next_partition) - except KubernetesClientError: - logger.exception("Cannot set rolling update partition") - self.set_unit_failed() - self.log_rollback_instructions() - - def _set_rolling_update_partition(self, partition: int) -> None: - """Patch the StatefulSet's `spec.updateStrategy.rollingUpdate.partition`. - - Args: - partition: partition to set. - - K8s only. It should decrement the rolling update strategy partition by using a code - like the following: - - from lightkube.core.client import Client - from lightkube.core.exceptions import ApiError - from lightkube.resources.apps_v1 import StatefulSet - - try: - patch = {"spec": {"updateStrategy": {"rollingUpdate": {"partition": partition}}}} - Client().patch(StatefulSet, name=self.charm.model.app.name, namespace=self.charm.model.name, obj=patch) - logger.debug(f"Kubernetes StatefulSet partition set to {partition}") - except ApiError as e: - if e.status.code == 403: - cause = "`juju trust` needed" - else: - cause = str(e) - raise KubernetesClientError("Kubernetes StatefulSet patch failed", cause) - """ - if self.substrate == "vm": - return - - raise NotImplementedError diff --git a/metadata.yaml b/metadata.yaml index cb277d6f92..ae9ed6a2c4 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -46,10 +46,10 @@ resources: peers: database-peers: interface: postgresql_peers + refresh-v-three: + interface: refresh restart: interface: rolling_op - upgrade: - interface: upgrade provides: replication-offer: diff --git a/poetry.lock b/poetry.lock index efc1274337..4d13908627 100644 --- a/poetry.lock +++ b/poetry.lock @@ -424,6 +424,55 @@ files = [ [package.dependencies] pycparser = {version = "*", markers = "implementation_name != \"PyPy\""} +[[package]] +name = "charm-api" +version = "0.1.2" +description = "" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "charm_api-0.1.2-py3-none-any.whl", hash = "sha256:f76806f8d8e1e39ae4a812711350399168cd26b319a01e73a5816ef8fb9f7ed4"}, + {file = "charm_api-0.1.2.tar.gz", hash = "sha256:5d74418a3ffdee189dec1eaf648ba1bd7cff7449c3cdd2334a724e6722204c9d"}, +] + +[[package]] +name = "charm-json" +version = "0.1.1" +description = "" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "charm_json-0.1.1-py3-none-any.whl", hash = "sha256:a3fac62d45821d1a8c14058632e21333ec4e2cd41d0d00d6a73d00fc9a656eef"}, + {file = "charm_json-0.1.1.tar.gz", hash = 
"sha256:cb2eb24f6135d226ad04b0a17288ca2e027160d8af288083ef701bf4b137154e"}, +] + +[package.dependencies] +charm-api = ">=0.1.1" + +[[package]] +name = "charm-refresh" +version = "3.1.0.3" +description = "In-place rolling refreshes (upgrades) of stateful charmed applications" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "charm_refresh-3.1.0.3-py3-none-any.whl", hash = "sha256:3f7d871dad188177008c06b860b95c710054f146d8d708ba8090e0c1986380d8"}, + {file = "charm_refresh-3.1.0.3.tar.gz", hash = "sha256:c25992548d1d51a0fb1cab8de910e5dc8314c5f3e9a6d2f7ef950d81cb70a4b5"}, +] + +[package.dependencies] +charm-api = ">=0.1.1" +charm-json = ">=0.1.1" +httpx = ">=0.28.1" +lightkube = ">=0.15.4" +ops = ">=2.9.0" +packaging = ">=24.1" +pyyaml = ">=6.0.2" +tomli = ">=2.0.1" + [[package]] name = "charset-normalizer" version = "3.4.3" @@ -1541,7 +1590,7 @@ version = "25.0" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["integration", "unit"] +groups = ["main", "integration", "unit"] files = [ {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, @@ -2616,6 +2665,70 @@ files = [ doc = ["reno", "sphinx"] test = ["pytest", "tornado (>=4.5)", "typeguard"] +[[package]] +name = "tomli" +version = "2.3.0" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +groups = ["main", "integration"] +files = [ + {file = "tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45"}, + {file = "tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba"}, + {file = "tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf"}, + {file = "tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441"}, + {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845"}, + {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c"}, + {file = "tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456"}, + {file = "tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be"}, + {file = "tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac"}, + {file = "tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22"}, + {file = "tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f"}, + {file = "tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52"}, + {file = 
"tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8"}, + {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6"}, + {file = "tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876"}, + {file = "tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878"}, + {file = "tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b"}, + {file = "tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae"}, + {file = "tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b"}, + {file = "tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf"}, + {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f"}, + {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05"}, + {file = "tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606"}, + {file = "tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999"}, + {file = "tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e"}, + {file = "tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3"}, + {file = "tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc"}, + {file = "tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0"}, + {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879"}, + {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005"}, + {file = "tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463"}, + {file = "tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8"}, + {file = "tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77"}, + {file = "tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf"}, + {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530"}, + {file = 
"tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b"}, + {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67"}, + {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f"}, + {file = "tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0"}, + {file = "tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba"}, + {file = "tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b"}, + {file = "tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549"}, +] + +[[package]] +name = "tomli-w" +version = "1.2.0" +description = "A lil' TOML writer" +optional = false +python-versions = ">=3.9" +groups = ["integration"] +files = [ + {file = "tomli_w-1.2.0-py3-none-any.whl", hash = "sha256:188306098d013b691fcadc011abd66727d3c414c571bb01b1a174ba8c983cf90"}, + {file = "tomli_w-1.2.0.tar.gz", hash = "sha256:2dd14fac5a47c27be9cd4c976af5a12d87fb1f0b4512f81d69cce3b35ae25021"}, +] + [[package]] name = "toposort" version = "1.10" @@ -2927,4 +3040,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "4b1388713dd09659be0b0031aabb2f824e00723309cf8ade390aec1d3744e1d9" +content-hash = "78ec9c5086d12980a128cad181f6805cec9875e85f239fa57541d27f40382efb" diff --git a/pyproject.toml b/pyproject.toml index eeeafa818a..13b970325d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ lightkube = "^0.17.2" lightkube-models = "^1.28.1.4" psycopg2 = "^2.9.10" postgresql-charms-single-kernel = "16.1.0" +charm-refresh = "^3.1.0.2" [tool.poetry.group.charm-libs.dependencies] # data_platform_libs/v0/data_interfaces.py @@ -72,6 +73,8 @@ boto3 = "*" tenacity = "^9.1.2" allure-pytest = "^2.15.0" jubilant = "^1.4.0" +tomli-w = "^1.2.0" +tomli = "^2.3.0" [build-system] requires = ["poetry-core>=1.0.0"] @@ -137,7 +140,7 @@ notice-rgx = "Copyright\\s\\d{4}([-,]\\d{4})*\\s+" min-file-size = 1 [tool.ruff.lint.mccabe] -max-complexity = 10 +max-complexity = 12 [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/refresh_versions.toml b/refresh_versions.toml new file mode 100644 index 0000000000..29ecfdcd9c --- /dev/null +++ b/refresh_versions.toml @@ -0,0 +1,5 @@ +# https://canonical-charm-refresh.readthedocs-hosted.com/latest/refresh-versions-toml/ + +charm = "16/1.0.0" +charm_major = 1 +workload = "16.10" diff --git a/src/backups.py b/src/backups.py index a32e69934b..7b04cb1962 100644 --- a/src/backups.py +++ b/src/backups.py @@ -7,6 +7,7 @@ import logging import os import re +import signal import tempfile import time from datetime import UTC, datetime @@ -26,7 +27,7 @@ from ops.framework import Object from ops.jujuversion import JujuVersion from ops.model import ActiveStatus, MaintenanceStatus -from ops.pebble import ChangeError, ExecError +from ops.pebble import ChangeError, ExecError, ServiceStatus from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed from constants import ( @@ -545,7 +546,7 @@ def _initialise_stanza(self, event: HookEvent) -> bool: event.defer() return False - self.charm.unit.status = 
MaintenanceStatus("initialising stanza") + self.charm.set_unit_status(MaintenanceStatus("initialising stanza")) # Create the stanza. try: @@ -589,7 +590,7 @@ def check_stanza(self) -> bool: # Update the configuration to use pgBackRest as the archiving mechanism. self.charm.update_config() - self.charm.unit.status = MaintenanceStatus("checking stanza") + self.charm.set_unit_status(MaintenanceStatus("checking stanza")) try: # If the tls is enabled, it requires all the units in the cluster to run the pgBackRest service to @@ -754,7 +755,7 @@ def _on_s3_credential_changed_primary(self, event: HookEvent) -> bool: return True - def _on_create_backup_action(self, event) -> None: # noqa: C901 + def _on_create_backup_action(self, event) -> None: """Request that pgBackRest creates a backup.""" backup_type = event.params.get("type", "full") if backup_type not in BACKUP_TYPE_OVERRIDES: @@ -810,7 +811,7 @@ def _on_create_backup_action(self, event) -> None: # noqa: C901 self._change_connectivity_to_database(connectivity=False) disabled_connectivity = True - self.charm.unit.status = MaintenanceStatus("creating backup") + self.charm.set_unit_status(MaintenanceStatus("creating backup")) # Set flag due to missing in progress backups on JSON output # (reference: https://github.com/pgbackrest/pgbackrest/issues/2007) self.charm.update_config(is_creating_backup=True) @@ -820,6 +821,8 @@ def _on_create_backup_action(self, event) -> None: # noqa: C901 "pgbackrest", f"--stanza={self.stanza_name}", "--log-level-console=debug", + "--log-level-file=debug", + "--log-subprocess", f"--type={BACKUP_TYPE_OVERRIDES[backup_type]}", "backup", ] @@ -884,7 +887,7 @@ def _on_create_backup_action(self, event) -> None: # noqa: C901 self._change_connectivity_to_database(connectivity=True) self.charm.update_config(is_creating_backup=False) - self.charm.unit.status = ActiveStatus() + self.charm.set_unit_status(ActiveStatus()) def _on_s3_credential_gone(self, _) -> None: self.container.stop(self.charm.rotate_logs_service) @@ -974,7 +977,7 @@ def _on_restore_action(self, event): # noqa: C901 f"Chosen timeline {restore_stanza_timeline[1]} as nearest for the specified timestamp {restore_to_time}" ) - self.charm.unit.status = MaintenanceStatus("restoring backup") + self.charm.set_unit_status(MaintenanceStatus("restoring backup")) # Temporarily disabling patroni (postgresql) pebble service auto-restart on failures. This is required # as point-in-time-recovery can fail on restore, therefore during cluster bootstrapping process. In this @@ -1085,7 +1088,7 @@ def _fetch_backup_from_id(self, backup_id: str) -> str | None: return None - def _pre_restore_checks(self, event: ActionEvent) -> bool: # noqa: C901 + def _pre_restore_checks(self, event: ActionEvent) -> bool: """Run some checks before starting the restore. Returns: @@ -1303,7 +1306,18 @@ def start_stop_pgbackrest_service(self) -> bool: return False # Start the service. 
- self.container.restart(self.charm.pgbackrest_server_service) + services = self.container.pebble.get_services(names=[self.charm.pgbackrest_server_service]) + if len(services) == 0: + return False + + if services[0].current == ServiceStatus.ACTIVE: + logger.debug("Sending SIGHUP to pgBackRest TLS server to reload configuration") + self.container.pebble.send_signal( + signal.SIGHUP, services=[self.charm.pgbackrest_server_service] + ) + else: + logger.debug("Starting pgBackRest TLS server service") + self.container.restart(self.charm.pgbackrest_server_service) return True def _upload_content_to_s3( diff --git a/src/charm.py b/src/charm.py index 1b36862d2b..8be61c8b03 100755 --- a/src/charm.py +++ b/src/charm.py @@ -8,6 +8,7 @@ import json import logging import os +import pathlib import re import shutil import sys @@ -23,6 +24,7 @@ AuthorisationRulesChangeCharmEvents, AuthorisationRulesObserver, ) +from refresh import PostgreSQLRefresh # First platform-specific import, will fail on wrong architecture try: @@ -39,6 +41,7 @@ main(WrongArchitectureWarningCharm, use_juju_for_storage=True) raise +import charm_refresh from charms.data_platform_libs.v0.data_interfaces import DataPeerData, DataPeerUnitData from charms.data_platform_libs.v1.data_models import TypedCharmBase from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider @@ -66,12 +69,14 @@ SecretChangedEvent, SecretNotFoundError, SecretRemoveEvent, + StatusBase, Unit, UnknownStatus, WaitingStatus, WorkloadEvent, main, ) +from ops.log import JujuLogHandler from ops.pebble import ( ChangeError, ExecError, @@ -152,10 +157,11 @@ from relations.postgresql_provider import PostgreSQLProvider from relations.tls import TLS from relations.tls_transfer import TLSTransfer -from upgrade import PostgreSQLUpgrade, get_postgresql_k8s_dependencies_model from utils import any_cpu_to_cores, any_memory_to_bytes, new_password logger = logging.getLogger(__name__) +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("httpcore").setLevel(logging.WARNING) EXTENSIONS_DEPENDENCY_MESSAGE = "Unsatisfied plugin dependencies. Please check the logs" EXTENSION_OBJECT_MESSAGE = "Cannot disable plugins: Existing objects depend on it. See logs" @@ -188,7 +194,6 @@ class CannotConnectError(Exception): PostgreSQLLDAP, PostgreSQLProvider, TLS, - PostgreSQLUpgrade, RollingOpsManager, ), ) @@ -201,15 +206,11 @@ class PostgresqlOperatorCharm(TypedCharmBase[CharmConfig]): def __init__(self, *args): super().__init__(*args) - # Support for disabling the operator. - disable_file = Path(f"{os.environ.get('CHARM_DIR')}/disable") - if disable_file.exists(): - logger.warning( - f"\n\tDisable file `{disable_file.resolve()}` found, the charm will skip all events." - "\n\tTo resume normal operations, please remove the file." 
- ) - self.unit.status = BlockedStatus("Disabled") - sys.exit(0) + # Show logger name (module name) in logs + root_logger = logging.getLogger() + for handler in root_logger.handlers: + if isinstance(handler, JujuLogHandler): + handler.setFormatter(logging.Formatter("{name}:{message}", style="{")) self.peer_relation_app = DataPeerData( self.model, @@ -251,18 +252,13 @@ def __init__(self, *args): self.framework.observe(self.on.promote_to_primary_action, self._on_promote_to_primary) self.framework.observe(self.on.get_primary_action, self._on_get_primary) self.framework.observe(self.on.update_status, self._on_update_status) + self.framework.observe(self.on.collect_unit_status, self._reconcile_refresh_status) self.framework.observe(self.on.secret_remove, self._on_secret_remove) self._certs_path = "/usr/local/share/ca-certificates" self._storage_path = str(self.meta.storages["data"].location) self.pgdata_path = f"{self._storage_path}/pgdata" - self.upgrade = PostgreSQLUpgrade( - self, - model=get_postgresql_k8s_dependencies_model(), - relation_name="upgrade", - substrate="k8s", - ) self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm) self.postgresql_client_relation = PostgreSQLProvider(self) self.backup = PostgreSQLBackups(self, "s3-parameters") @@ -274,6 +270,50 @@ def __init__(self, *args): self.restart_manager = RollingOpsManager( charm=self, relation="restart", callback=self._restart ) + + if self.model.juju_version.supports_open_port_on_k8s: + try: + self.unit.set_ports(5432, 8008) + except ModelError: + logger.exception("failed to open port") + + try: + self.refresh = charm_refresh.Kubernetes( + PostgreSQLRefresh( + workload_name="PostgreSQL", + charm_name="postgresql-k8s", + oci_resource_name="postgresql-image", + _charm=self, + ) + ) + except ( + charm_refresh.KubernetesJujuAppNotTrusted, + charm_refresh.PeerRelationNotReady, + charm_refresh.UnitTearingDown, + ): + self.refresh = None + self._reconcile_refresh_status() + + # Support for disabling the operator. + disable_file = Path(f"{os.environ.get('CHARM_DIR')}/disable") + if disable_file.exists(): + logger.warning( + f"\n\tDisable file `{disable_file.resolve()}` found, the charm will skip all events." + "\n\tTo resume normal operations, please remove the file." 
+ ) + self.set_unit_status(BlockedStatus("Disabled")) + sys.exit(0) + + if ( + self.refresh is not None + and self.refresh.workload_allowed_to_start + and not self.refresh.next_unit_allowed_to_refresh + ): + if self.refresh.in_progress: + self.reconcile() + else: + self.refresh.next_unit_allowed_to_refresh = True + self._observer.start_authorisation_rules_observer() self.grafana_dashboards = GrafanaDashboardProvider(self) self.metrics_endpoint = MetricsEndpointProvider( @@ -286,16 +326,101 @@ def __init__(self, *args): logs_scheme={"postgresql": {"log-files": POSTGRES_LOG_FILES}}, relation_name="logging", ) - - if self.model.juju_version.supports_open_port_on_k8s: - try: - self.unit.set_ports(5432, 8008) - except ModelError: - logger.exception("failed to open port") self.tracing = TracingEndpointRequirer( self, relation_name=TRACING_RELATION_NAME, protocols=[TRACING_PROTOCOL] ) + def reconcile(self): + """Reconcile the unit state on refresh.""" + self.set_unit_status(MaintenanceStatus("starting services")) + self._update_pebble_layers(replan=True) + + if not self._patroni.member_started: + logger.debug("Early exit reconcile: Patroni has not started yet") + return + + if self.unit.is_leader() and not self._patroni.primary_endpoint_ready: + logger.debug( + "Early exit reconcile: current unit is leader but primary endpoint is not ready yet" + ) + return + + self.set_unit_status(WaitingStatus("waiting for database initialisation")) + try: + for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(10)): + with attempt: + if not ( + self.unit.name.replace("/", "-") in self._patroni.cluster_members + and self._patroni.is_replication_healthy + ): + logger.error( + "Instance not yet back in the cluster or not healthy." + f" Retry {attempt.retry_state.attempt_number}/6" + ) + raise Exception + except RetryError: + logger.debug("Upgraded unit is not part of the cluster or not healthy") + self.set_unit_status( + BlockedStatus("upgrade failed. Check logs for rollback instruction") + ) + else: + if self.refresh is not None: + self.refresh.next_unit_allowed_to_refresh = True + self.set_unit_status(ActiveStatus()) + + def _reconcile_refresh_status(self, _=None): + if self.unit.is_leader(): + self.async_replication.set_app_status() + + # Workaround for other unit statuses being set in a stateful way (i.e. 
unable to recompute + # status on every event) + path = pathlib.Path(".last_refresh_unit_status.json") + try: + last_refresh_unit_status = json.loads(path.read_text()) + except FileNotFoundError: + last_refresh_unit_status = None + new_refresh_unit_status = None + if self.refresh is not None and self.refresh.unit_status_higher_priority: + self.unit.status = self.refresh.unit_status_higher_priority + new_refresh_unit_status = self.refresh.unit_status_higher_priority.message + elif self.unit.status.message == last_refresh_unit_status: + if self.refresh is not None and ( + refresh_status := self.refresh.unit_status_lower_priority() + ): + self.unit.status = refresh_status + new_refresh_unit_status = refresh_status.message + else: + # Clear refresh status from unit status + self._set_active_status() + elif ( + isinstance(self.unit.status, ActiveStatus) + and self.refresh is not None + and (refresh_status := self.refresh.unit_status_lower_priority()) + ): + self.unit.status = refresh_status + new_refresh_unit_status = refresh_status.message + path.write_text(json.dumps(new_refresh_unit_status)) + + def set_unit_status( + self, status: StatusBase, /, *, refresh: charm_refresh.Kubernetes | None = None + ): + """Set unit status without overriding higher priority refresh status.""" + if refresh is None: + refresh = getattr(self, "refresh", None) + if refresh is not None and refresh.unit_status_higher_priority: + return + if ( + isinstance(status, ActiveStatus) + and refresh is not None + and (refresh_status := refresh.unit_status_lower_priority()) + ): + self.unit.status = refresh_status + pathlib.Path(".last_refresh_unit_status.json").write_text( + json.dumps(refresh_status.message) + ) + return + self.unit.status = status + def _on_databases_change(self, _): """Handle databases change event.""" self.update_config() @@ -640,7 +765,7 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None: # noqa: C901 try: self.update_config() except ValueError as e: - self.unit.status = BlockedStatus("Configuration Error. Please check the logs") + self.set_unit_status(BlockedStatus("Configuration Error. Please check the logs")) logger.error("Invalid configuration: %s", str(e)) return @@ -661,7 +786,7 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None: # noqa: C901 # Validate the status of the member before setting an ActiveStatus. 
if not self._patroni.member_started: logger.debug("Deferring on_peer_relation_changed: Waiting for member to start") - self.unit.status = WaitingStatus("awaiting for member to start") + self.set_unit_status(WaitingStatus("awaiting for member to start")) event.defer() return @@ -715,15 +840,17 @@ def _on_secret_changed(self, event: SecretChangedEvent) -> None: except PostgreSQLUpdateUserPasswordError: event.defer() - def _on_config_changed(self, event) -> None: # noqa: C901 + def _on_config_changed(self, event) -> None: """Handle configuration changes, like enabling plugins.""" if not self.is_cluster_initialised: logger.debug("Defer on_config_changed: cluster not initialised yet") event.defer() return - if not self.upgrade.idle: - logger.debug("Defer on_config_changed: upgrade in progress") + if self.refresh is None: + logger.warning("Warning _on_config_changed: Refresh could be in progress") + elif self.refresh.in_progress: + logger.debug("Defer on_config_changed: Refresh in progress") event.defer() return @@ -736,7 +863,7 @@ def _on_config_changed(self, event) -> None: # noqa: C901 event.defer() return except ValueError as e: - self.unit.status = BlockedStatus("Configuration Error. Please check the logs") + self.set_unit_status(BlockedStatus("Configuration Error. Please check the logs")) logger.error("Invalid configuration: %s", str(e)) return if not self.updated_synchronous_node_count(): @@ -788,7 +915,7 @@ def enable_disable_extensions(self, database: str | None = None) -> None: continue extension = PLUGIN_OVERRIDES.get(extension, extension) if self._check_extension_dependencies(extension, enable): - self.unit.status = BlockedStatus(EXTENSIONS_DEPENDENCY_MESSAGE) + self.set_unit_status(BlockedStatus(EXTENSIONS_DEPENDENCY_MESSAGE)) return extensions[extension] = enable if self.is_blocked and self.unit.status.message == EXTENSIONS_DEPENDENCY_MESSAGE: @@ -800,7 +927,7 @@ def enable_disable_extensions(self, database: str | None = None) -> None: def _handle_enable_disable_extensions(self, original_status, extensions, database) -> None: """Try enablind/disabling Postgresql extensions and handle exceptions appropriately.""" if not isinstance(original_status, UnknownStatus): - self.unit.status = WaitingStatus("Updating extensions") + self.set_unit_status(WaitingStatus("Updating extensions")) try: self.postgresql.enable_disable_extensions(extensions, database) except psycopg2.errors.DependentObjectsStillExist as e: @@ -808,7 +935,7 @@ def _handle_enable_disable_extensions(self, original_status, extensions, databas "Failed to disable plugin: %s\nWas the plugin enabled manually? 
If so, update charm config with `juju config postgresql-k8s plugin__enable=True`", str(e), ) - self.unit.status = BlockedStatus(EXTENSION_OBJECT_MESSAGE) + self.set_unit_status(BlockedStatus(EXTENSION_OBJECT_MESSAGE)) return except PostgreSQLEnableDisableExtensionError as e: logger.exception("failed to change plugins: %s", str(e)) @@ -816,7 +943,7 @@ def _handle_enable_disable_extensions(self, original_status, extensions, databas self._set_active_status() return if not isinstance(original_status, UnknownStatus): - self.unit.status = original_status + self.set_unit_status(original_status) def _check_extension_dependencies(self, extension: str, enable: bool) -> bool: skip = False @@ -856,7 +983,7 @@ def _add_members(self, event) -> None: return logger.info("Reconfiguring cluster") - self.unit.status = MaintenanceStatus("reconfiguring cluster") + self.set_unit_status(MaintenanceStatus("reconfiguring cluster")) for member in self._hosts - self._patroni.cluster_members: logger.debug("Adding %s to cluster", member) self.add_cluster_member(member) @@ -889,7 +1016,7 @@ def add_cluster_member(self, member: str) -> None: self._patch_pod_labels(member) except ApiError as e: logger.error("failed to patch pod") - self.unit.status = BlockedStatus(f"failed to patch pod with error {e}") + self.set_unit_status(BlockedStatus(f"failed to patch pod with error {e}")) return @property @@ -928,7 +1055,7 @@ def _setup_passwords(self, event: LeaderElectedEvent) -> None: except (ModelError, SecretNotFoundError) as e: # only display the error but don't return to make sure all users have passwords logger.error(f"Error setting internal passwords: {e}") - self.unit.status = BlockedStatus("Password setting for system users failed.") + self.set_unit_status(BlockedStatus("Password setting for system users failed.")) event.defer() for password in { @@ -966,12 +1093,12 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None: return # Create resources and add labels needed for replication. - if self.upgrade.idle: + if self.refresh is not None and not self.refresh.in_progress: try: self._create_services() except ApiError: logger.exception("failed to create k8s services") - self.unit.status = BlockedStatus("failed to create k8s services") + self.set_unit_status(BlockedStatus("failed to create k8s services")) return # Remove departing units when the leader changes. @@ -1043,6 +1170,14 @@ def _on_start(self, _) -> None: def _on_postgresql_pebble_ready(self, event: WorkloadEvent) -> None: """Event handler for PostgreSQL container on PebbleReadyEvent.""" + # Safeguard against starting while refreshing. + if self.refresh is None: + logger.warning("Warning on_postgresql_pebble_ready: Refresh could be in progress") + elif self.refresh.in_progress: + logger.debug("Defer on_postgresql_pebble_ready: Refresh in progress") + event.defer() + return + if self._endpoint in self._endpoints: self._fix_pod() @@ -1061,8 +1196,6 @@ def _on_postgresql_pebble_ready(self, event: WorkloadEvent) -> None: # where the volume is mounted with more restrictive permissions. self._create_pgdata(container) - self.unit.set_workload_version(self._patroni.rock_postgresql_version) - # Defer the initialization of the workload in the replicas # if the cluster hasn't been bootstrap on the primary yet. # Otherwise, each unit will create a different cluster and @@ -1098,7 +1231,7 @@ def _on_postgresql_pebble_ready(self, event: WorkloadEvent) -> None: # Ensure the member is up and running before marking the cluster as initialised. 
if not self._patroni.member_started: logger.debug("Deferring on_postgresql_pebble_ready: Waiting for cluster to start") - self.unit.status = WaitingStatus("awaiting for cluster to start") + self.set_unit_status(WaitingStatus("awaiting for cluster to start")) event.defer() return @@ -1124,15 +1257,15 @@ def _set_active_status(self): return try: if self.unit.is_leader() and "s3-initialization-block-message" in self.app_peer_data: - self.unit.status = BlockedStatus( - self.app_peer_data["s3-initialization-block-message"] + self.set_unit_status( + BlockedStatus(self.app_peer_data["s3-initialization-block-message"]) ) return if self.unit.is_leader() and ( self.app_peer_data.get("logical-replication-validation") == "error" or self.logical_replication.has_remote_publisher_errors() ): - self.unit.status = BlockedStatus(LOGICAL_REPLICATION_VALIDATION_ERROR_STATUS) + self.set_unit_status(BlockedStatus(LOGICAL_REPLICATION_VALIDATION_ERROR_STATUS)) return if ( self._patroni.get_primary(unit_name_pattern=True) == self.unit.name @@ -1141,11 +1274,13 @@ def _set_active_status(self): danger_state = "" if len(self._patroni.get_running_cluster_members()) < self.app.planned_units(): danger_state = " (degraded)" - self.unit.status = ActiveStatus( - f"{'Standby' if self.is_standby_leader else 'Primary'}{danger_state}" + self.set_unit_status( + ActiveStatus( + f"{'Standby' if self.is_standby_leader else 'Primary'}{danger_state}" + ) ) elif self._patroni.member_started: - self.unit.status = ActiveStatus() + self.set_unit_status(ActiveStatus()) except (RetryError, RequestsConnectionError) as e: logger.error(f"failed to get primary with error {e}") @@ -1156,16 +1291,16 @@ def _initialize_cluster(self, event: HookEvent) -> bool: self._patch_pod_labels(self._unit) except ApiError as e: logger.error("failed to patch pod") - self.unit.status = BlockedStatus(f"failed to patch pod with error {e}") + self.set_unit_status(BlockedStatus(f"failed to patch pod with error {e}")) return False # Create resources and add labels needed for replication - if self.upgrade.idle: + if self.refresh is not None and not self.refresh.in_progress: try: self._create_services() except ApiError: logger.exception("failed to create k8s services") - self.unit.status = BlockedStatus("failed to create k8s services") + self.set_unit_status(BlockedStatus("failed to create k8s services")) return False async_replication_primary_cluster = self.async_replication.get_primary_cluster() @@ -1182,7 +1317,7 @@ def _initialize_cluster(self, event: HookEvent) -> bool: logger.debug( "Deferring on_postgresql_pebble_ready: Waiting for primary endpoint to be ready" ) - self.unit.status = WaitingStatus("awaiting for primary endpoint to be ready") + self.set_unit_status(WaitingStatus("awaiting for primary endpoint to be ready")) event.defer() return False @@ -1191,17 +1326,17 @@ def _initialize_cluster(self, event: HookEvent) -> bool: except PostgreSQLCreatePredefinedRolesError: message = "Failed to create pre-defined roles" logger.exception(message) - self.unit.status = BlockedStatus(message) + self.set_unit_status(BlockedStatus(message)) return False except PostgreSQLGrantDatabasePrivilegesToUserError: message = "Failed to grant database privileges to user" logger.exception(message) - self.unit.status = BlockedStatus(message) + self.set_unit_status(BlockedStatus(message)) return False except PostgreSQLCreateUserError: message = "Failed to create postgres user" logger.exception(message) - self.unit.status = BlockedStatus(message) + 
self.set_unit_status(BlockedStatus(message)) return False except PostgreSQLListUsersError: logger.warning("Deferring on_start: Unable to list users") @@ -1388,7 +1523,7 @@ def _update_admin_password(self, admin_secret_id: str) -> None: logger.error( "Failed changing the password: This can be ran only in the cluster from the offer side." ) - self.unit.status = BlockedStatus("Password update for system users failed.") + self.set_unit_status(BlockedStatus("Password update for system users failed.")) return try: @@ -1406,7 +1541,7 @@ def _update_admin_password(self, admin_secret_id: str) -> None: updated_passwords.pop(user) except (ModelError, SecretNotFoundError) as e: logger.error(f"Error updating internal passwords: {e}") - self.unit.status = BlockedStatus("Password update for system users failed.") + self.set_unit_status(BlockedStatus("Password update for system users failed.")) return try: @@ -1422,7 +1557,7 @@ def _update_admin_password(self, admin_secret_id: str) -> None: self.set_secret(APP_SCOPE, f"{user}-password", password) except PostgreSQLUpdateUserPasswordError as e: logger.exception(e) - self.unit.status = BlockedStatus("Password update for system users failed.") + self.set_unit_status(BlockedStatus("Password update for system users failed.")) return # Update and reload Patroni configuration in this unit to use the new password. @@ -1472,19 +1607,19 @@ def _fix_pod(self) -> None: # Recreate k8s resources and add labels required for replication # when the pod loses them (like when it's deleted). self.push_tls_files_to_workload() - if self.upgrade.idle: + if self.refresh is not None and not self.refresh.in_progress: try: self._create_services() except ApiError: logger.exception("failed to create k8s services") - self.unit.status = BlockedStatus("failed to create k8s services") + self.set_unit_status(BlockedStatus("failed to create k8s services")) return try: self._patch_pod_labels(self.unit.name) except ApiError as e: logger.error("failed to patch pod") - self.unit.status = BlockedStatus(f"failed to patch pod with error {e}") + self.set_unit_status(BlockedStatus(f"failed to patch pod with error {e}")) return # Update the sync-standby endpoint in the async replication data. 
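Note on the pattern in the hunks above: every direct `self.unit.status = ...` assignment in this file now goes through `set_unit_status()` so that a status reported by the refresh machinery is not clobbered by an ordinary hook. A minimal sketch of that intent, using a stand-in object instead of the real `charm_refresh` API (only the `unit_status_higher_priority` attribute visible in this diff is assumed; `_FakeRefresh` is illustrative only):

class _FakeRefresh:
    # Stand-in for charm_refresh.Kubernetes, for illustration only.
    unit_status_higher_priority = "refreshing unit 2/3"


def set_unit_status(refresh, new_status):
    # Same guard as in charm.py: if the refresh reports a higher-priority status,
    # keep it and ignore what the call site wanted to show.
    if refresh is not None and refresh.unit_status_higher_priority:
        return refresh.unit_status_higher_priority
    return new_status


assert set_unit_status(_FakeRefresh(), "Primary") == "refreshing unit 2/3"
assert set_unit_status(None, "Primary") == "Primary"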
@@ -1568,8 +1703,11 @@ def _on_stop(self, _): ) def _on_update_status_early_exit_checks(self, container) -> bool: - if not self.upgrade.idle: - logger.debug("Early exit on_update_status: upgrade in progress") + if self.refresh is None: + logger.debug("Early exit on_update_status: Refresh could be in progress") + return False + if self.refresh.in_progress: + logger.debug("Early exit on_update_status: Refresh in progress") return False if not container.can_connect(): @@ -1610,9 +1748,9 @@ def _check_pgdata_storage_size(self) -> None: free_size / total_size, ) if free_size / total_size < 0.1: - self.unit.status = BlockedStatus(INSUFFICIENT_SIZE_WARNING) + self.set_unit_status(BlockedStatus(INSUFFICIENT_SIZE_WARNING)) elif self.unit.status.message == INSUFFICIENT_SIZE_WARNING: - self.unit.status = ActiveStatus() + self.set_unit_status(ActiveStatus()) self._set_active_status() def _on_update_status(self, _) -> None: @@ -1641,7 +1779,7 @@ def _on_update_status(self, _) -> None: logger.exception("Failed to restart patroni") # If service doesn't recover fast, exit and wait for next hook run to re-check if not self._patroni.member_started: - self.unit.status = MaintenanceStatus("Database service inactive, restarting") + self.set_unit_status(MaintenanceStatus("Database service inactive, restarting")) return if ( @@ -1666,7 +1804,7 @@ def _was_restore_successful(self, container: Container, service: ServiceInfo) -> "You can launch another restore with different parameters" ) self.log_pitr_last_transaction_time() - self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR) + self.set_unit_status(BlockedStatus(CANNOT_RESTORE_PITR)) return False if ( @@ -1674,7 +1812,7 @@ def _was_restore_successful(self, container: Container, service: ServiceInfo) -> and self.unit.status.message != CANNOT_RESTORE_PITR ): logger.error("Restore failed: database service failed to start") - self.unit.status = BlockedStatus("Failed to restore backup") + self.set_unit_status(BlockedStatus("Failed to restore backup")) return False if not self._patroni.member_started: @@ -2045,7 +2183,7 @@ def _restart(self, event: RunWithLock) -> None: except RetryError: error_message = "failed to restart PostgreSQL" logger.exception(error_message) - self.unit.status = BlockedStatus(error_message) + self.set_unit_status(BlockedStatus(error_message)) return # Update health check URL. 
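The early-exit checks above follow the same three-way guard used throughout this diff: `self.refresh` is `None` when the charm-refresh object could not be built (app not trusted, peer relation not ready, or unit tearing down), so a refresh might still be in progress; otherwise `self.refresh.in_progress` decides whether the hook defers. A small sketch of that decision, where `refresh_state` is a hypothetical helper and not part of the charm:

def refresh_state(refresh) -> str:
    """Classify the refresh state for hook guards (hypothetical helper)."""
    if refresh is None:
        # charm_refresh.Kubernetes() raised (e.g. PeerRelationNotReady), so a
        # refresh could be in progress; hooks log a warning and stay cautious.
        return "unknown"
    if refresh.in_progress:
        # Hooks defer or exit early until the refresh finishes.
        return "in progress"
    return "idle"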
@@ -2160,7 +2298,6 @@ def update_config(self, is_creating_backup: bool = False) -> bool: is_creating_backup=is_creating_backup, enable_ldap=self.is_ldap_enabled, enable_tls=self.is_tls_enabled, - is_no_sync_member=self.upgrade.is_no_sync_member, backup_id=self.app_peer_data.get("restoring-backup"), pitr_target=self.app_peer_data.get("restore-to-time"), restore_timeline=self.app_peer_data.get("restore-timeline"), @@ -2377,8 +2514,10 @@ def get_available_resources(self) -> tuple[int, int]: def on_deployed_without_trust(self) -> None: """Blocks the application and returns a specific error message for deployments made without --trust.""" - self.unit.status = BlockedStatus( - f"Insufficient permissions, try: `juju trust {self._name} --scope=cluster`" + self.set_unit_status( + BlockedStatus( + f"Insufficient permissions, try: `juju trust {self._name} --scope=cluster`" + ) ) logger.error( f""" diff --git a/src/dependency.json b/src/dependency.json deleted file mode 100644 index 1f87a03a6d..0000000000 --- a/src/dependency.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "charm": { - "dependencies": {"pgbouncer": ">0"}, - "name": "postgresql", - "upgrade_supported": ">0", - "version": "2" - }, - "rock": { - "dependencies": {}, - "name": "charmed-postgresql", - "upgrade_supported": "^16", - "version": "16.6" - } -} diff --git a/src/ldap.py b/src/ldap.py index a6804cbca7..2ed6e8f012 100644 --- a/src/ldap.py +++ b/src/ldap.py @@ -44,7 +44,7 @@ def _on_ldap_ready(self, _: LdapReadyEvent) -> None: self.charm.app_peer_data.update({"ldap_enabled": "True"}) self.charm.update_config() - self.charm.unit.status = ActiveStatus() + self.charm.set_unit_status(ActiveStatus()) def _on_ldap_unavailable(self, _: LdapUnavailableEvent) -> None: """Handler for the LDAP unavailable event.""" diff --git a/src/refresh.py b/src/refresh.py new file mode 100644 index 0000000000..3620241160 --- /dev/null +++ b/src/refresh.py @@ -0,0 +1,69 @@ +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +"""Refresh logic for postgresql-k8s operator charm.""" + +import dataclasses +import logging +from typing import TYPE_CHECKING + +import charm_refresh +from charm_refresh import CharmSpecificKubernetes, CharmVersion + +if TYPE_CHECKING: + from charm import PostgresqlOperatorCharm + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass(eq=False) +class PostgreSQLRefresh(CharmSpecificKubernetes): + """Base class for PostgreSQL refresh operations.""" + + _charm: "PostgresqlOperatorCharm" + + @classmethod + def is_compatible( + cls, + *, + old_charm_version: CharmVersion, + new_charm_version: CharmVersion, + old_workload_version: str, + new_workload_version: str, + ) -> bool: + """Checks charm version compatibility.""" + if not super().is_compatible( + old_charm_version=old_charm_version, + new_charm_version=new_charm_version, + old_workload_version=old_workload_version, + new_workload_version=new_workload_version, + ): + return False + + # Check workload version compatibility + old_major, old_minor = (int(component) for component in old_workload_version.split(".")) + new_major, new_minor = (int(component) for component in new_workload_version.split(".")) + if old_major != new_major: + return False + return new_minor >= old_minor + + def run_pre_refresh_checks_after_1_unit_refreshed(self) -> None: + """Implement pre-refresh checks after 1 unit refreshed.""" + logger.debug("Running pre-refresh checks") + if not self._charm._patroni.are_all_members_ready(): + raise charm_refresh.PrecheckFailed("Not all members are ready yet.") + if self._charm._patroni.is_creating_backup: + raise charm_refresh.PrecheckFailed("A backup is being created.") + + def run_pre_refresh_checks_before_any_units_refreshed(self) -> None: + """Implement pre-refresh checks before any unit refreshed.""" + self.run_pre_refresh_checks_after_1_unit_refreshed() + + # If the first unit is not the primary we ask the user to switchover + # the primary to it. + primary_unit_name = self._charm._patroni.get_primary(unit_name_pattern=True) + unit_zero_name = f"{self._charm.app.name}/0" + if primary_unit_name != unit_zero_name: + raise charm_refresh.PrecheckFailed( + f"Switch primary to {unit_zero_name} to avoid multiple switchovers during refresh." + ) diff --git a/src/relations/async_replication.py b/src/relations/async_replication.py index 426bae6552..d05738282d 100644 --- a/src/relations/async_replication.py +++ b/src/relations/async_replication.py @@ -131,7 +131,7 @@ def _can_promote_cluster(self, event: ActionEvent) -> bool: self.charm._peers.data[self.charm.app].update({ "promoted-cluster-counter": "" }) - self._set_app_status() + self.set_app_status() self.charm._set_active_status() except (StandbyClusterAlreadyPromotedError, ClusterNotPromotedError) as e: event.fail(str(e)) @@ -370,8 +370,8 @@ def _handle_database_start(self, event: RelationChangedEvent) -> None: "cluster_initialised": "True" }) elif self._is_following_promoted_cluster(): - self.charm.unit.status = WaitingStatus( - "Waiting for the database to be started in all units" + self.charm.set_unit_status( + WaitingStatus("Waiting for the database to be started in all units") ) event.defer() return @@ -382,12 +382,12 @@ def _handle_database_start(self, event: RelationChangedEvent) -> None: else: # If the standby leader fails to start, fix the leader annotation and defer the event. 
self.charm.fix_leader_annotation() - self.charm.unit.status = WaitingStatus( - "Still starting the database in the standby leader" + self.charm.set_unit_status( + WaitingStatus("Still starting the database in the standby leader") ) event.defer() except NotReadyError: - self.charm.unit.status = WaitingStatus("Waiting for the database to start") + self.charm.set_unit_status(WaitingStatus("Waiting for the database to start")) logger.debug("Deferring on_async_relation_changed: database hasn't started yet.") event.defer() @@ -422,7 +422,7 @@ def handle_read_only_mode(self) -> None: self.charm._set_active_status() if self.charm.unit.is_leader(): - self._set_app_status() + self.set_app_status() def _handle_replication_change(self, event: ActionEvent) -> bool: if not self._can_promote_cluster(event): @@ -488,7 +488,7 @@ def _on_async_relation_broken(self, _) -> None: if self.charm._patroni.get_standby_leader() is not None: if self.charm.unit.is_leader(): self.charm._peers.data[self.charm.app].update({"promoted-cluster-counter": "0"}) - self._set_app_status() + self.set_app_status() else: if self.charm.unit.is_leader(): self.charm._peers.data[self.charm.app].update({"promoted-cluster-counter": ""}) @@ -497,7 +497,7 @@ def _on_async_relation_broken(self, _) -> None: def _on_async_relation_changed(self, event: RelationChangedEvent) -> None: """Update the Patroni configuration if one of the clusters was already promoted.""" if self.charm.unit.is_leader(): - self._set_app_status() + self.set_app_status() primary_cluster = self.get_primary_cluster() logger.debug("Primary cluster: %s", primary_cluster) @@ -522,8 +522,8 @@ def _on_async_relation_changed(self, event: RelationChangedEvent) -> None: == self._get_highest_promoted_cluster_counter_value() for unit in self.charm._peers.units ): - self.charm.unit.status = WaitingStatus( - "Waiting for the database to be stopped in all units" + self.charm.set_unit_status( + WaitingStatus("Waiting for the database to be stopped in all units") ) logger.debug("Deferring on_async_relation_changed: not all units stopped.") event.defer() @@ -580,7 +580,7 @@ def _on_create_replication(self, event: ActionEvent) -> None: self._relation.data[self.charm.app].update({"name": event.params["name"]}) # type: ignore # Set the status. - self.charm.unit.status = MaintenanceStatus("Creating replication...") + self.charm.set_unit_status(MaintenanceStatus("Creating replication...")) def promote_to_primary(self, event: ActionEvent) -> None: """Promote this cluster to the primary cluster.""" @@ -597,7 +597,7 @@ def promote_to_primary(self, event: ActionEvent) -> None: return # Set the status. 
- self.charm.unit.status = MaintenanceStatus("Promoting cluster...") + self.charm.set_unit_status(MaintenanceStatus("Promoting cluster...")) def _on_secret_changed(self, event: SecretChangedEvent) -> None: """Update the internal secret when the relation secret changes.""" @@ -680,8 +680,13 @@ def _remove_previous_cluster_information(self) -> None: raise e logger.debug(f"{values[0]} {values[1]} not found") - def _set_app_status(self) -> None: + def set_app_status(self) -> None: """Set the app status.""" + if self.charm.refresh is not None and self.charm.refresh.app_status_higher_priority: + self.charm.app.status = self.charm.refresh.app_status_higher_priority + return + if self.charm._peers is None: + return if self.charm._peers.data[self.charm.app].get("promoted-cluster-counter") == "0": self.charm.app.status = BlockedStatus(READ_ONLY_MODE_BLOCKING_MESSAGE) return @@ -795,8 +800,8 @@ def _wait_for_standby_leader(self, event: RelationChangedEvent) -> bool: except RetryError: standby_leader = None if not self.charm.unit.is_leader() and standby_leader is None: - self.charm.unit.status = WaitingStatus( - "Waiting for the standby leader start the database" + self.charm.set_unit_status( + WaitingStatus("Waiting for the standby leader start the database") ) logger.debug("Deferring on_async_relation_changed: standby leader hasn't started yet.") event.defer() diff --git a/src/relations/logical_replication.py b/src/relations/logical_replication.py index 24239442f6..fbaa04e059 100644 --- a/src/relations/logical_replication.py +++ b/src/relations/logical_replication.py @@ -202,7 +202,7 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: logger.error( f"Got logical replication error from the publisher in {LOGICAL_REPLICATION_RELATION} #{event.relation.id}: {error}" ) - self.charm.unit.status = BlockedStatus(LOGICAL_REPLICATION_VALIDATION_ERROR_STATUS) + self.charm.set_unit_status(BlockedStatus(LOGICAL_REPLICATION_VALIDATION_ERROR_STATUS)) secret_content = self.model.get_secret( id=event.relation.data[event.app]["secret-id"] @@ -427,7 +427,7 @@ def _fail_validation(self, message: str | None = None) -> bool: if message: logger.error(f"Logical replication validation: {message}") self.charm.app_peer_data["logical-replication-validation"] = "error" - self.charm.unit.status = BlockedStatus(LOGICAL_REPLICATION_VALIDATION_ERROR_STATUS) + self.charm.set_unit_status(BlockedStatus(LOGICAL_REPLICATION_VALIDATION_ERROR_STATUS)) return False def _validate_new_publication( diff --git a/src/relations/postgresql_provider.py b/src/relations/postgresql_provider.py index aa5de5970f..0ee4e7e1a0 100644 --- a/src/relations/postgresql_provider.py +++ b/src/relations/postgresql_provider.py @@ -117,10 +117,10 @@ def _get_custom_credentials( password = val break if user in SYSTEM_USERS or user in self.charm.postgresql.list_users(): - self.charm.unit.status = BlockedStatus(FORBIDDEN_USER_MSG) + self.charm.set_unit_status(BlockedStatus(FORBIDDEN_USER_MSG)) return except ModelError: - self.charm.unit.status = BlockedStatus(NO_ACCESS_TO_SECRET_MSG) + self.charm.set_unit_status(BlockedStatus(NO_ACCESS_TO_SECRET_MSG)) return return user, password @@ -222,14 +222,16 @@ def _on_database_requested(self, event: DatabaseRequestedEvent) -> None: PostgreSQLGetPostgreSQLVersionError, ) as e: logger.exception(e) - self.charm.unit.status = BlockedStatus( - e.message - if ( - issubclass(type(e), PostgreSQLCreateDatabaseError) - or issubclass(type(e), PostgreSQLCreateUserError) + self.charm.set_unit_status( + BlockedStatus( + 
e.message + if ( + issubclass(type(e), PostgreSQLCreateDatabaseError) + or issubclass(type(e), PostgreSQLCreateUserError) + ) + and e.message is not None + else f"Failed to initialize {self.relation_name} relation" ) - and e.message is not None - else f"Failed to initialize {self.relation_name} relation" ) return @@ -277,8 +279,10 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None: self.charm.postgresql.delete_user(user) except PostgreSQLDeleteUserError as e: logger.exception(e) - self.charm.unit.status = BlockedStatus( - f"Failed to delete user during {self.relation_name} relation broken event" + self.charm.set_unit_status( + BlockedStatus( + f"Failed to delete user during {self.relation_name} relation broken event" + ) ) self.update_username_mapping(event.relation.id, None) @@ -368,19 +372,19 @@ def _update_unit_status(self, relation: Relation) -> None: and not self.check_for_invalid_extra_user_roles(relation.id) and not self.check_for_invalid_database_name(relation.id) ): - self.charm.unit.status = ActiveStatus() + self.charm.set_unit_status(ActiveStatus()) if ( self.charm._has_blocked_status and "Failed to initialize relation" in self.charm.unit.status.message ): - self.charm.unit.status = ActiveStatus() + self.charm.set_unit_status(ActiveStatus()) if self.charm._has_blocked_status and self.charm.unit.status.message in [ INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE, NO_ACCESS_TO_SECRET_MSG, FORBIDDEN_USER_MSG, ]: if self.check_for_invalid_extra_user_roles(relation.id): - self.charm.unit.status = BlockedStatus(INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE) + self.charm.set_unit_status(BlockedStatus(INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE)) return existing_users = self.charm.postgresql.list_users() for relation in self.charm.model.relations.get(self.relation_name, []): @@ -401,13 +405,13 @@ def _update_unit_status(self, relation: Relation) -> None: logger.warning( f"Relation {relation.id} is still requesting a forbidden user" ) - self.charm.unit.status = BlockedStatus(FORBIDDEN_USER_MSG) + self.charm.set_unit_status(BlockedStatus(FORBIDDEN_USER_MSG)) return except ModelError: logger.warning(f"Relation {relation.id} still cannot access the set secret") - self.charm.unit.status = BlockedStatus(NO_ACCESS_TO_SECRET_MSG) + self.charm.set_unit_status(BlockedStatus(NO_ACCESS_TO_SECRET_MSG)) return - self.charm.unit.status = ActiveStatus() + self.charm.set_unit_status(ActiveStatus()) def check_for_invalid_extra_user_roles(self, relation_id: int) -> bool: """Checks if there are relations with invalid extra user roles. diff --git a/src/upgrade.py b/src/upgrade.py deleted file mode 100644 index 36c80cfaca..0000000000 --- a/src/upgrade.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Upgrades implementation.""" - -import json -import logging -from typing import override - -from charms.data_platform_libs.v0.upgrade import ( - ClusterNotReadyError, - DataUpgrade, - DependencyModel, - KubernetesClientError, -) -from lightkube.core.client import Client -from lightkube.core.exceptions import ApiError -from lightkube.resources.apps_v1 import StatefulSet -from ops.charm import UpgradeCharmEvent, WorkloadEvent -from ops.model import BlockedStatus, MaintenanceStatus, RelationDataContent -from pydantic.v1 import BaseModel -from single_kernel_postgresql.utils.postgresql import ACCESS_GROUPS -from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed - -from constants import APP_SCOPE, MONITORING_PASSWORD_KEY, MONITORING_USER, PATRONI_PASSWORD_KEY -from patroni import SwitchoverFailedError -from utils import new_password - -logger = logging.getLogger(__name__) - - -class PostgreSQLDependencyModel(BaseModel): - """PostgreSQL dependencies model.""" - - charm: DependencyModel - rock: DependencyModel - - -def get_postgresql_k8s_dependencies_model() -> PostgreSQLDependencyModel: - """Return the PostgreSQL dependencies model.""" - with open("src/dependency.json") as dependency_file: - _deps = json.load(dependency_file) - return PostgreSQLDependencyModel(**_deps) - - -class PostgreSQLUpgrade(DataUpgrade): - """PostgreSQL upgrade class.""" - - def __init__(self, charm, model: BaseModel, **kwargs) -> None: - """Initialize the class.""" - super().__init__(charm, model, **kwargs) - self.charm = charm - - self.framework.observe(self.charm.on.upgrade_relation_changed, self._on_upgrade_changed) - self.framework.observe( - self.charm.on.postgresql_pebble_ready, self._on_postgresql_pebble_ready - ) - self.framework.observe(self.charm.on.upgrade_charm, self._on_upgrade_charm_check_legacy) - - def _handle_label_change(self) -> None: - """Handle the label change from `master` to `primary`.""" - unit_number = int(self.charm.unit.name.split("/")[1]) - if unit_number == 1: - # If the unit is the last to be upgraded before unit zero, - # trigger a switchover, so one of the upgraded units becomes - # the primary. - try: - self.charm._patroni.switchover() - except SwitchoverFailedError as e: - logger.warning(f"Switchover failed: {e}") - if len(self.charm._peers.units) == 0 or unit_number == 1: - # If the unit is the last to be upgraded before unit zero - # or the only unit in the cluster, update the label. - self.charm._create_services() - - @property - def is_no_sync_member(self) -> bool: - """Whether this member shouldn't be a synchronous standby (when it's a replica).""" - if not self.peer_relation: - return False - - sync_standbys = self.peer_relation.data[self.charm.app].get("sync-standbys") - if sync_standbys is None: - return False - return self.charm.unit.name not in json.loads(sync_standbys) - - @override - def log_rollback_instructions(self) -> None: - """Log rollback instructions.""" - logger.info( - "Run `juju refresh --revision postgresql-k8s` to initiate the rollback" - ) - logger.info( - "and `juju run-action postgresql-k8s/leader resume-upgrade` to resume the rollback" - ) - - def _on_postgresql_pebble_ready(self, event: WorkloadEvent) -> None: - """Handle pebble ready event. - - Confirm that unit is healthy and set unit completed. 
- """ - if not self.peer_relation: - logger.debug("Deferring on_pebble_ready: no upgrade peer relation yet") - event.defer() - return - - if self.state not in ["upgrading", "recovery"]: - return - - # Don't mark the upgrade of this unit as completed until Patroni reports the - # workload is ready. - if not self.charm._patroni.member_started: - logger.debug("Deferring on_pebble_ready: Patroni has not started yet") - event.defer() - return - - if self.charm.unit.is_leader(): - if not self.charm._patroni.primary_endpoint_ready: - logger.debug( - "Deferring on_pebble_ready: current unit is leader but primary endpoint is not ready yet" - ) - event.defer() - return - self._set_up_new_credentials_for_legacy() - self._set_up_new_access_roles_for_legacy() - - try: - for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(10)): - with attempt: - if ( - self.charm.unit.name.replace("/", "-") - in self.charm._patroni.cluster_members - and self.charm._patroni.is_replication_healthy - ): - self._handle_label_change() - logger.debug("Upgraded unit is healthy. Set upgrade state to `completed`") - self.set_unit_completed() - else: - logger.debug( - "Instance not yet back in the cluster or not healthy." - f" Retry {attempt.retry_state.attempt_number}/6" - ) - raise Exception - except RetryError: - logger.error("Upgraded unit is not part of the cluster or not healthy") - self.set_unit_failed() - self.charm.unit.status = BlockedStatus( - "upgrade failed. Check logs for rollback instruction" - ) - - def _on_upgrade_changed(self, event) -> None: - """Update the Patroni nosync tag in the unit if needed.""" - if not self.peer_relation or not self.charm._patroni.member_started: - return - - self.charm.update_config() - self.charm.updated_synchronous_node_count() - - def _on_upgrade_charm_check_legacy(self, event: UpgradeCharmEvent) -> None: - if not self.peer_relation: - logger.debug("Wait all units join the upgrade relation") - return - - if self.state: - # Do nothing - if state set, upgrade is supported - return - - logger.warning("Upgrading from unspecified version") - - # All peers should set the state to upgrading. - self.unit_upgrade_data.update({"state": "upgrading"}) - - if self.charm.unit.name != f"{self.charm.app.name}/{self.charm.app.planned_units() - 1}": - self.charm.unit.status = MaintenanceStatus("upgrading unit") - self.peer_relation.data[self.charm.unit].update({"state": "upgrading"}) - self._set_rolling_update_partition(self.charm.app.planned_units()) - - @override - def pre_upgrade_check(self) -> None: - """Runs necessary checks validating the cluster is in a healthy state to upgrade. - - Called by all units during :meth:`_on_pre_upgrade_check_action`. - - Raises: - :class:`ClusterNotReadyError`: if cluster is not ready to upgrade - """ - default_message = "Pre-upgrade check failed and cannot safely upgrade" - if not self.charm._patroni.are_all_members_ready(): - raise ClusterNotReadyError( - default_message, - "not all members are ready yet", - "wait for all units to become active/idle", - ) - - if self.charm._patroni.is_creating_backup: - raise ClusterNotReadyError( - default_message, - "a backup is being created", - "wait for the backup creation to finish before starting the upgrade", - ) - - # If the first unit is already the primary we don't need to do any - # switchover. 
- primary_unit_name = self.charm._patroni.get_primary(unit_name_pattern=True) - unit_zero_name = f"{self.charm.app.name}/0" - if primary_unit_name == unit_zero_name: - # Should be replaced with refresh v3 - self.peer_relation.data[self.charm.app].update({"sync-standbys": ""}) # type: ignore - self._set_first_rolling_update_partition() - return - - sync_standby_names = self.charm._patroni.get_sync_standby_names() - if len(sync_standby_names) == 0: - raise ClusterNotReadyError("invalid number of sync nodes", "no action!") - - # If the first unit is a sync-standby we can switchover to it. - if unit_zero_name in sync_standby_names: - try: - # Should be replaced with refresh v3 - self.peer_relation.data[self.charm.app].update({"sync-standbys": ""}) # type: ignore - self.charm._patroni.switchover(unit_zero_name) - except SwitchoverFailedError as e: - raise ClusterNotReadyError( - str(e), f"try to switchover manually to {unit_zero_name}" - ) from e - self._set_first_rolling_update_partition() - return - - # If the first unit is not one of the sync-standbys, make it one and request - # the action to be executed again (because relation data need to be propagated - # to the other units to make some of them simple replicas and enable the fist - # unit to become a sync-standby before we can trigger a switchover to it). - self._set_list_of_sync_standbys() - cause = f"{unit_zero_name} needs to be a synchronous standby in order to become the primary before the upgrade process can start" - resolution = f"wait 30 seconds for {unit_zero_name} to become a synchronous standby and run this action again" - action_message = f"{cause} - {resolution}" - raise ClusterNotReadyError(action_message, cause, resolution) - - def _set_list_of_sync_standbys(self) -> None: - """Set the list of desired sync-standbys in the relation data.""" - if self.charm.app.planned_units() > 2: - sync_standbys = self.charm._patroni.get_sync_standby_names() - # Include the first unit as one of the sync-standbys. 
- unit_to_become_sync_standby = f"{self.charm.app.name}/0" - if unit_to_become_sync_standby not in set(sync_standbys): - if len(sync_standbys) > 0: - sync_standbys.pop() - sync_standbys.append(unit_to_become_sync_standby) - # Should be replaced with refresh v3 - self.peer_relation.data[self.charm.app].update({ # type: ignore - "sync-standbys": json.dumps(sync_standbys) - }) - logger.debug(f"sync-standbys changed to: {sync_standbys}") - - @override - def _set_rolling_update_partition(self, partition: int) -> None: - """Set the rolling update partition to a specific value.""" - try: - patch = {"spec": {"updateStrategy": {"rollingUpdate": {"partition": partition}}}} - Client().patch( - StatefulSet, - name=self.charm.model.app.name, - namespace=self.charm.model.name, - obj=patch, - ) - logger.debug(f"Kubernetes StatefulSet partition set to {partition}") - except ApiError as e: - cause = "`juju trust` needed" if e.status.code == 403 else str(e) - raise KubernetesClientError("Kubernetes StatefulSet patch failed", cause) from e - - def _set_first_rolling_update_partition(self) -> None: - """Set the initial rolling update partition value.""" - try: - self._set_rolling_update_partition(self.charm.app.planned_units() - 1) - except KubernetesClientError as e: - raise ClusterNotReadyError(e.message, e.cause) from e - - def _set_up_new_access_roles_for_legacy(self) -> None: - """Create missing access groups and their memberships.""" - access_groups = self.charm.postgresql.list_access_groups() - if access_groups == set(ACCESS_GROUPS) and sorted( - self.charm.postgresql.list_users_from_relation() - ) == sorted(self.charm.postgresql.list_users(group="relation_access")): - return - - self.charm.postgresql.create_access_groups() - self.charm.postgresql.grant_internal_access_group_memberships() - self.charm.postgresql.grant_relation_access_group_memberships() - - def _set_up_new_credentials_for_legacy(self) -> None: - """Create missing password and user.""" - for key in (MONITORING_PASSWORD_KEY, PATRONI_PASSWORD_KEY): - if self.charm.get_secret(APP_SCOPE, key) is None: - self.charm.set_secret(APP_SCOPE, key, new_password()) - users = self.charm.postgresql.list_users() - if MONITORING_USER not in users: - self.charm.postgresql.create_user( - MONITORING_USER, - self.charm.get_secret(APP_SCOPE, MONITORING_PASSWORD_KEY), - extra_user_roles="pg_monitor", - ) - - @property - def unit_upgrade_data(self) -> RelationDataContent: - """Return the application upgrade data.""" - # Should be replaced with refresh v3 - return self.peer_relation.data[self.charm.unit] # type: ignore diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 60be4f8c6e..1c850986f2 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -9,9 +9,7 @@ import subprocess import tarfile import tempfile -import zipfile from datetime import datetime -from pathlib import Path import kubernetes as kubernetes import psycopg2 @@ -49,7 +47,6 @@ run_command_on_unit, ) from ..juju_ import juju_major_version -from ..new_relations.helpers import get_application_relation_data PORT = 5432 @@ -564,23 +561,6 @@ async def get_sync_standby(model: Model, application_name: str) -> str: return member["name"] -async def inject_dependency_fault( - ops_test: OpsTest, application_name: str, charm_file: str | Path -) -> None: - """Inject a dependency fault into the PostgreSQL charm.""" - # Query running dependency to overwrite with incompatible version. 
- dependencies = await get_application_relation_data( - ops_test, application_name, "upgrade", "dependencies" - ) - loaded_dependency_dict = json.loads(dependencies) - loaded_dependency_dict["charm"]["upgrade_supported"] = "^25" - loaded_dependency_dict["charm"]["version"] = "25.0" - - # Overwrite dependency.json with incompatible version. - with zipfile.ZipFile(charm_file, mode="a") as charm_zip: - charm_zip.writestr("src/dependency.json", json.dumps(loaded_dependency_dict)) - - async def is_connection_possible(ops_test: OpsTest, unit_name: str) -> bool: """Test a connection to a PostgreSQL server.""" try: diff --git a/tests/integration/ha_tests/test_upgrade.py b/tests/integration/ha_tests/test_upgrade.py index f074cae750..8ca9386493 100644 --- a/tests/integration/ha_tests/test_upgrade.py +++ b/tests/integration/ha_tests/test_upgrade.py @@ -4,29 +4,28 @@ import asyncio import logging import shutil +import zipfile from pathlib import Path +from time import sleep import pytest -from lightkube import Client -from lightkube.resources.apps_v1 import StatefulSet +import tomli +import tomli_w from pytest_operator.plugin import OpsTest -from tenacity import Retrying, stop_after_attempt, wait_fixed from ..helpers import ( APPLICATION_NAME, - CHARM_BASE, CHARM_BASE_NOBLE, DATABASE_APP_NAME, METADATA, count_switchovers, get_leader_unit, get_primary, - get_unit_by_index, + switchover_to_unit_zero, ) from .helpers import ( are_writes_increasing, check_writes, - inject_dependency_fault, start_continuous_writes, ) @@ -39,19 +38,33 @@ async def test_deploy_latest(ops_test: OpsTest) -> None: """Simple test to ensure that the PostgreSQL and application charms get deployed.""" await asyncio.gather( - ops_test.model.deploy( + # TODO: remove call to ops_test.juju and uncomment call to ops_test.model.deploy. + ops_test.juju( + "deploy", DATABASE_APP_NAME, - num_units=3, - channel="16/edge", - trust=True, - config={"profile": "testing"}, - base=CHARM_BASE_NOBLE, + "-n", + 3, + "--channel", + "16/edge/neppel", + "--trust", + "--config", + "profile=testing", + "--base", + CHARM_BASE_NOBLE, ), + # ops_test.model.deploy( + # DATABASE_APP_NAME, + # num_units=3, + # channel="16/edge", + # trust=True, + # config={"profile": "testing"}, + # base=CHARM_BASE_NOBLE, + # ), ops_test.model.deploy( APPLICATION_NAME, num_units=1, channel="latest/edge", - base=CHARM_BASE, + config={"sleep_interval": 500}, ), ) await ops_test.model.relate(DATABASE_APP_NAME, f"{APPLICATION_NAME}:database") @@ -66,29 +79,17 @@ async def test_deploy_latest(ops_test: OpsTest) -> None: @pytest.mark.abort_on_fail -async def test_pre_upgrade_check(ops_test: OpsTest) -> None: - """Test that the pre-upgrade-check action runs successfully.""" +async def test_pre_refresh_check(ops_test: OpsTest) -> None: + """Test that the pre-refresh-check action runs successfully.""" logger.info("Get leader unit") leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) assert leader_unit is not None, "No leader unit found" - for attempt in Retrying(stop=stop_after_attempt(2), wait=wait_fixed(30), reraise=True): - with attempt: - logger.info("Run pre-upgrade-check action") - action = await leader_unit.run_action("pre-upgrade-check") - await action.wait() - - # Ensure the primary has changed to the first unit. 
- primary_name = await get_primary(ops_test, DATABASE_APP_NAME) - assert primary_name == f"{DATABASE_APP_NAME}/0", "Primary unit not set to unit 0" - - logger.info("Assert partition is set to 2") - client = Client() - stateful_set = client.get( - res=StatefulSet, namespace=ops_test.model.info.name, name=DATABASE_APP_NAME - ) + logger.info("Run pre-refresh-check action") + action = await leader_unit.run_action("pre-refresh-check") + await action.wait() - assert stateful_set.spec.updateStrategy.rollingUpdate.partition == 2, "Partition not set to 2" + await switchover_to_unit_zero(ops_test) @pytest.mark.abort_on_fail @@ -110,26 +111,47 @@ async def test_upgrade_from_edge(ops_test: OpsTest, charm, continuous_writes) -> logger.info("Refresh the charm") await application.refresh(path=charm, resources=resources) - logger.info("Wait for upgrade to complete on first upgrading unit") - # highest ordinal unit always the first to upgrade - unit = get_unit_by_index(DATABASE_APP_NAME, application.units, 2) + logger.info("Wait for upgrade to start") + await ops_test.model.block_until(lambda: application.status == "blocked", timeout=60 * 3) + logger.info("Wait for refresh to block as paused or incompatible") async with ops_test.fast_forward("60s"): - await ops_test.model.block_until( - lambda: unit.workload_status_message == "upgrade completed", timeout=TIMEOUT - ) await ops_test.model.wait_for_idle( apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT ) - # Check whether writes are increasing. - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) + # Highest to lowest unit number + refresh_order = sorted( + application.units, key=lambda unit: int(unit.name.split("/")[1]), reverse=True + ) + + if "Refresh incompatible" in refresh_order[0].workload_status_message: + logger.info("Application refresh is blocked due to incompatibility") + + action = await refresh_order[0].run_action( + "force-refresh-start", **{"check-compatibility": False} + ) + await action.wait() + + logger.info("Wait for first incompatible unit to upgrade") + async with ops_test.fast_forward("60s"): + await ops_test.model.wait_for_idle( + apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT + ) + else: + async with ops_test.fast_forward("60s"): + await ops_test.model.block_until( + lambda: all(unit.workload_status == "active" for unit in application.units), + timeout=60 * 3, + ) + + sleep(60) - logger.info("Resume upgrade") leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - action = await leader_unit.run_action("resume-upgrade") + logger.info(f"Run resume-refresh action on {leader_unit.name}") + action = await leader_unit.run_action("resume-refresh") await action.wait() + logger.info(f"Results from the action: {action.results}") logger.info("Wait for upgrade to complete") async with ops_test.fast_forward("60s"): @@ -167,69 +189,67 @@ async def test_fail_and_rollback(ops_test, charm, continuous_writes) -> None: leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) assert leader_unit is not None, "No leader unit found" - for attempt in Retrying(stop=stop_after_attempt(2), wait=wait_fixed(30), reraise=True): - with attempt: - logger.info("Run pre-upgrade-check action") - action = await leader_unit.run_action("pre-upgrade-check") - await action.wait() + logger.info("Run pre-refresh-check action") + + action = await leader_unit.run_action("pre-refresh-check") + await action.wait() - # Ensure the primary has changed to the first unit. 
- primary_name = await get_primary(ops_test, DATABASE_APP_NAME) - assert primary_name == f"{DATABASE_APP_NAME}/0" + await switchover_to_unit_zero(ops_test) filename = Path(charm).name - fault_charm = Path("/tmp/", filename) + fault_charm = Path("/tmp", f"{filename}.fault.charm") shutil.copy(charm, fault_charm) logger.info("Inject dependency fault") - await inject_dependency_fault(ops_test, DATABASE_APP_NAME, fault_charm) + await inject_dependency_fault(fault_charm) application = ops_test.model.applications[DATABASE_APP_NAME] logger.info("Refresh the charm") await application.refresh(path=fault_charm) - logger.info("Get first upgrading unit") - # Highest ordinal unit always the first to upgrade. - unit = get_unit_by_index(DATABASE_APP_NAME, application.units, 2) + logger.info("Wait for upgrade to fail") - logger.info("Wait for upgrade to fail on first upgrading unit") - async with ops_test.fast_forward("60s"): - await ops_test.model.block_until( - lambda: unit.workload_status == "blocked", - timeout=TIMEOUT, - ) + # Highest to lowest unit number + refresh_order = sorted( + application.units, key=lambda unit: int(unit.name.split("/")[1]), reverse=True + ) + + await ops_test.model.block_until( + lambda: application.status == "blocked" + and "Refresh incompatible" in refresh_order[0].workload_status_message, + timeout=TIMEOUT, + ) logger.info("Ensure continuous_writes while in failure state on remaining units") await are_writes_increasing(ops_test) - logger.info("Re-run pre-upgrade-check action") - action = await leader_unit.run_action("pre-upgrade-check") - await action.wait() - logger.info("Re-refresh the charm") await application.refresh(path=charm) + logger.info("Wait for upgrade to start") + + await ops_test.model.block_until(lambda: application.status == "blocked", timeout=TIMEOUT) + + logger.info("Wait for application to recover") async with ops_test.fast_forward("60s"): await ops_test.model.block_until( - lambda: unit.workload_status_message == "upgrade completed", timeout=TIMEOUT - ) - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT + lambda: all(unit.workload_status == "active" for unit in application.units), + timeout=60 * 3, ) - # Check whether writes are increasing. - logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test) + sleep(60) - logger.info("Resume upgrade") - action = await leader_unit.run_action("resume-upgrade") + leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) + logger.info(f"Run resume-refresh action on {leader_unit.name}") + action = await leader_unit.run_action("resume-refresh") await action.wait() + logger.info(f"Results from the action: {action.results}") - logger.info("Wait for application to recover") + logger.info("Wait for upgrade to complete") async with ops_test.fast_forward("60s"): await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", timeout=TIMEOUT + apps=[DATABASE_APP_NAME], status="active", idle_period=30, timeout=TIMEOUT ) logger.info("Ensure continuous_writes after rollback procedure") @@ -242,3 +262,16 @@ async def test_fail_and_rollback(ops_test, charm, continuous_writes) -> None: # Remove fault charm file. 
fault_charm.unlink() + + +async def inject_dependency_fault(charm_file: str | Path) -> None: + """Inject a dependency fault into the PostgreSQL charm.""" + with Path("refresh_versions.toml").open("rb") as file: + versions = tomli.load(file) + + versions["charm"] = "16/0.0.0" + versions["workload"] = "16.10" + + # Overwrite refresh_versions.toml with incompatible version. + with zipfile.ZipFile(charm_file, mode="a") as charm_zip: + charm_zip.writestr("refresh_versions.toml", tomli_w.dumps(versions)) diff --git a/tests/integration/ha_tests/test_upgrade_from_stable.py b/tests/integration/ha_tests/test_upgrade_from_stable.py index 86c012d1f7..cf90d36577 100644 --- a/tests/integration/ha_tests/test_upgrade_from_stable.py +++ b/tests/integration/ha_tests/test_upgrade_from_stable.py @@ -3,23 +3,20 @@ import asyncio import logging +from time import sleep import pytest -from lightkube import Client -from lightkube.resources.apps_v1 import StatefulSet from pytest_operator.plugin import OpsTest -from tenacity import Retrying, stop_after_attempt, wait_fixed from ..helpers import ( APPLICATION_NAME, - CHARM_BASE, CHARM_BASE_NOBLE, DATABASE_APP_NAME, METADATA, count_switchovers, get_leader_unit, get_primary, - get_unit_by_index, + switchover_to_unit_zero, ) from .helpers import ( are_writes_increasing, @@ -36,60 +33,56 @@ async def test_deploy_stable(ops_test: OpsTest) -> None: """Simple test to ensure that the PostgreSQL and application charms get deployed.""" await asyncio.gather( - ops_test.model.deploy( + # TODO: remove call to ops_test.juju and uncomment call to ops_test.model.deploy. + ops_test.juju( + "deploy", DATABASE_APP_NAME, - num_units=3, - # TODO: move to stable once we release. - channel="16/edge", - trust=True, - config={"profile": "testing"}, - base=CHARM_BASE_NOBLE, + "-n", + 3, + "--channel", + "16/edge/neppel", + "--trust", + "--config", + "profile=testing", + "--base", + CHARM_BASE_NOBLE, ), + # ops_test.model.deploy( + # DATABASE_APP_NAME, + # num_units=3, + # channel="16/stable", + # trust=True, + # config={"profile": "testing"}, + # base=CHARM_BASE_NOBLE, + # ), ops_test.model.deploy( APPLICATION_NAME, num_units=1, channel="latest/edge", - base=CHARM_BASE, + config={"sleep_interval": 500}, ), ) await ops_test.model.relate(DATABASE_APP_NAME, f"{APPLICATION_NAME}:database") logger.info("Wait for applications to become active") async with ops_test.fast_forward(): await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active" + apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", timeout=(20 * 60) ) assert len(ops_test.model.applications[DATABASE_APP_NAME].units) == 3 @pytest.mark.abort_on_fail -async def test_pre_upgrade_check(ops_test: OpsTest) -> None: - """Test that the pre-upgrade-check action runs successfully.""" - application = ops_test.model.applications[DATABASE_APP_NAME] - if "pre-upgrade-check" not in await application.get_actions(): - logger.info("skipping the test because the charm from 14/stable doesn't support upgrade") - return - +async def test_pre_refresh_check(ops_test: OpsTest) -> None: + """Test that the pre-refresh-check action runs successfully.""" logger.info("Get leader unit") leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) assert leader_unit is not None, "No leader unit found" - for attempt in Retrying(stop=stop_after_attempt(2), wait=wait_fixed(30), reraise=True): - with attempt: - logger.info("Run pre-upgrade-check action") - action = await leader_unit.run_action("pre-upgrade-check") - await 
action.wait() + logger.info("Run pre-refresh-check action") + action = await leader_unit.run_action("pre-refresh-check") + await action.wait() - # Ensure the primary has changed to the first unit. - primary_name = await get_primary(ops_test, DATABASE_APP_NAME) - assert primary_name == f"{DATABASE_APP_NAME}/0", "Primary unit not set to unit 0" - - logger.info("Assert partition is set to 2") - client = Client() - stateful_set = client.get( - res=StatefulSet, namespace=ops_test.model.info.name, name=DATABASE_APP_NAME - ) - - assert stateful_set.spec.updateStrategy.rollingUpdate.partition == 2, "Partition not set to 2" + await switchover_to_unit_zero(ops_test) @pytest.mark.abort_on_fail @@ -108,29 +101,52 @@ async def test_upgrade_from_stable(ops_test: OpsTest, charm): resources = {"postgresql-image": METADATA["resources"]["postgresql-image"]["upstream-source"]} application = ops_test.model.applications[DATABASE_APP_NAME] - actions = await application.get_actions() logger.info("Refresh the charm") await application.refresh(path=charm, resources=resources) - logger.info("Wait for upgrade to complete on first upgrading unit") - # Highest ordinal unit always the first to upgrade. - unit = get_unit_by_index(DATABASE_APP_NAME, application.units, 2) + logger.info("Wait for upgrade to start") + await ops_test.model.block_until(lambda: application.status == "blocked", timeout=60 * 3) + logger.info("Wait for refresh to block as paused or incompatible") async with ops_test.fast_forward("60s"): - await ops_test.model.block_until( - lambda: unit.workload_status_message == "upgrade completed", timeout=TIMEOUT - ) await ops_test.model.wait_for_idle( apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT ) - if "resume-upgrade" in actions: - logger.info("Resume upgrade") - leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) - action = await leader_unit.run_action("resume-upgrade") + # Highest to lowest unit number + refresh_order = sorted( + application.units, key=lambda unit: int(unit.name.split("/")[1]), reverse=True + ) + + if "Refresh incompatible" in application.status_message: + logger.info("Application refresh is blocked due to incompatibility") + + action = await refresh_order[0].run_action( + "force-refresh-start", **{"check-compatibility": False} + ) await action.wait() + logger.info("Wait for first incompatible unit to upgrade") + async with ops_test.fast_forward("60s"): + await ops_test.model.wait_for_idle( + apps=[DATABASE_APP_NAME], idle_period=30, timeout=TIMEOUT + ) + else: + async with ops_test.fast_forward("60s"): + await ops_test.model.block_until( + lambda: all(unit.workload_status == "active" for unit in application.units), + timeout=60 * 3, + ) + + sleep(60) + + leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME) + logger.info(f"Run resume-refresh action on {leader_unit.name}") + action = await leader_unit.run_action("resume-refresh") + await action.wait() + logger.info(f"Results from the action: {action.results}") + logger.info("Wait for upgrade to complete") async with ops_test.fast_forward("60s"): await ops_test.model.wait_for_idle( @@ -147,9 +163,8 @@ async def test_upgrade_from_stable(ops_test: OpsTest, charm): await check_writes(ops_test) # Check the number of switchovers. 
- if "pre-upgrade-check" in actions: - logger.info("checking the number of switchovers") - final_number_of_switchovers = await count_switchovers(ops_test, primary_name) - assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( - "Number of switchovers is greater than 2" - ) + logger.info("checking the number of switchovers") + final_number_of_switchovers = await count_switchovers(ops_test, primary_name) + assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( + "Number of switchovers is greater than 2" + ) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 93f5643326..f4367fd151 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -5,6 +5,7 @@ import itertools import json import logging +from asyncio import sleep from datetime import datetime from multiprocessing import ProcessError from pathlib import Path @@ -820,6 +821,22 @@ async def switchover( assert standbys == len(ops_test.model.applications[app_name].units) - 1 +async def switchover_to_unit_zero(ops_test: OpsTest) -> None: + primary_name = await get_primary(ops_test, DATABASE_APP_NAME) + expected_primary_name = f"{DATABASE_APP_NAME}/0" + if primary_name != expected_primary_name: + logger.info(f"Switching primary to {expected_primary_name}") + action = await ops_test.model.units[expected_primary_name].run_action( + "promote-to-primary", scope="unit" + ) + await action.wait() + + await sleep(30) + + primary_name = await get_primary(ops_test, DATABASE_APP_NAME) + assert primary_name == expected_primary_name, "Primary unit not set to unit 0" + + async def wait_for_idle_on_blocked( ops_test: OpsTest, database_app_name: str, diff --git a/tests/integration/test_trust.py b/tests/integration/test_trust.py index fe41f85c06..220d9e9e7f 100644 --- a/tests/integration/test_trust.py +++ b/tests/integration/test_trust.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. - +import asyncio import logging import pytest @@ -10,13 +10,14 @@ from .helpers import ( CHARM_BASE, METADATA, - get_leader_unit, ) logger = logging.getLogger(__name__) APP_NAME = "untrusted-postgresql-k8s" -UNTRUST_ERROR_MESSAGE = f"Insufficient permissions, try: `juju trust {APP_NAME} --scope=cluster`" +UNTRUST_ERROR_MESSAGE = ( + f"Run `juju trust {APP_NAME} --scope=cluster`. Needed for in-place refreshes" +) @pytest.mark.abort_on_fail @@ -36,18 +37,17 @@ async def test_deploy_without_trust(ops_test: OpsTest, charm): base=CHARM_BASE, ) - await ops_test.model.block_until( - lambda: any( - unit.workload_status == "blocked" - for unit in ops_test.model.applications[APP_NAME].units + logger.info("Waiting for charm to become blocked due to missing --trust") + await asyncio.gather( + ops_test.model.block_until( + lambda: ops_test.model.applications[APP_NAME].status == "blocked", timeout=1000 + ), + ops_test.model.block_until( + lambda: ops_test.model.applications[APP_NAME].status_message == UNTRUST_ERROR_MESSAGE, + timeout=1000, ), - timeout=1000, ) - leader_unit = await get_leader_unit(ops_test, APP_NAME) - assert leader_unit.workload_status == "blocked" - assert leader_unit.workload_status_message == UNTRUST_ERROR_MESSAGE - async def test_trust_blocked_deployment(ops_test: OpsTest): """Trust existing blocked deployment. 
@@ -56,4 +56,4 @@ async def test_trust_blocked_deployment(ops_test: OpsTest): """ await ops_test.juju("trust", APP_NAME, "--scope=cluster") - await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000) + await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=3000) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index b571899387..07ef567a6d 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -1,6 +1,6 @@ # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. -from unittest.mock import PropertyMock +from unittest.mock import Mock, PropertyMock, patch import pytest from charms.tempo_coordinator_k8s.v0.charm_tracing import charm_tracing_disabled @@ -17,3 +17,28 @@ def juju_has_secrets(request, monkeypatch): def disable_charm_tracing(): with charm_tracing_disabled(): yield + + +@pytest.fixture(autouse=True) +def mock_refresh(): + """Fixture to shunt refresh logic and events.""" + refresh_mock = Mock() + refresh_mock.in_progress = False + refresh_mock.app_status_higher_priority = None + refresh_mock.app_status_lower_priority.return_value = None + refresh_mock.unit_status_higher_priority = None + refresh_mock.unit_status_lower_priority.return_value = None + refresh_mock.next_unit_allowed_to_refresh = True + refresh_mock.workload_allowed_to_start = True + + # Mock the _RefreshVersions class to avoid KeyError when charm key is missing + versions_mock = Mock() + versions_mock.charm = "v1/16.0.0" + versions_mock.workload = "16.10" + + with ( + patch("charm_refresh.Kubernetes", Mock(return_value=refresh_mock)), + patch("charm.PostgreSQLRefresh", Mock(return_value=None)), + patch("charm_refresh._main._RefreshVersions", Mock(return_value=versions_mock)), + ): + yield diff --git a/tests/unit/test_arch_utils.py b/tests/unit/test_arch_utils.py index f655f28c5a..7285448085 100644 --- a/tests/unit/test_arch_utils.py +++ b/tests/unit/test_arch_utils.py @@ -25,6 +25,7 @@ def test_on_module_not_found_error(monkeypatch): # If psycopg2 not there, charm should check architecture monkeypatch.delitem(sys.modules, "psycopg2", raising=False) monkeypatch.delitem(sys.modules, "charm", raising=False) + monkeypatch.delitem(sys.modules, "charm_refresh", raising=False) monkeypatch.setattr(builtins, "__import__", psycopg2_not_found) with pytest.raises(ModuleNotFoundError): import charm diff --git a/tests/unit/test_async_replication.py b/tests/unit/test_async_replication.py index 689a1cb9f5..0ad84ab111 100644 --- a/tests/unit/test_async_replication.py +++ b/tests/unit/test_async_replication.py @@ -49,7 +49,7 @@ def test_on_async_relation_broken(harness, is_leader, relation_name): with ( patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, patch( - "relations.async_replication.PostgreSQLAsyncReplication._set_app_status" + "relations.async_replication.PostgreSQLAsyncReplication.set_app_status" ) as _set_app_status, patch("charm.Patroni.get_standby_leader") as _get_standby_leader, patch( diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index d8e876c88f..063ba74024 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -1894,6 +1894,12 @@ def test_retrieve_s3_parameters( def test_start_stop_pgbackrest_service(harness): + # Enable Pebble connectivity + harness.set_can_connect("postgresql", True) + + # Get the container to set up pebble mocking + container = harness.model.unit.get_container("postgresql") + with ( patch( 
"charm.PostgreSQLBackups._is_primary_pgbackrest_service_running", @@ -1915,6 +1921,7 @@ def test_start_stop_pgbackrest_service(harness): "charm.PostgreSQLBackups._render_pgbackrest_conf_file" ) as _render_pgbackrest_conf_file, patch("charm.PostgreSQLBackups._are_backup_settings_ok") as _are_backup_settings_ok, + patch.object(container.pebble, "send_signal") as _send_signal, ): # Test when S3 parameters are not ok (no operation, but returns success). _are_backup_settings_ok.return_value = (False, "fake error message") @@ -1961,18 +1968,49 @@ def test_start_stop_pgbackrest_service(harness): _stop.assert_not_called() _restart.assert_not_called() - # Test when the service has already started in the primary. + # Test when the service has already started in the primary and is ACTIVE. + # This should send SIGHUP signal to reload configuration. _is_primary_pgbackrest_service_running.return_value = True + + # Add pgbackrest service to Pebble plan using the container's pebble client + container.pebble.add_layer( + "pgbackrest", + { + "services": { + harness.charm.pgbackrest_server_service: { + "override": "replace", + "summary": "pgbackrest server", + "command": "/bin/true", + "startup": "enabled", + } + } + }, + ) + + # Start the service to make it ACTIVE + container.start(harness.charm.pgbackrest_server_service) + assert harness.charm.backup.start_stop_pgbackrest_service() is True _stop.assert_not_called() - _restart.assert_called_once() + _restart.assert_not_called() + _send_signal.assert_called_once() - # Test when this unit is the primary. + # Test when this unit is the primary and service is NOT ACTIVE (INACTIVE). + # This should restart the service. _restart.reset_mock() + _send_signal.reset_mock() _is_primary.return_value = True _is_primary_pgbackrest_service_running.return_value = False + + # Stop the service using the Testing Pebble Client's stop_services method + container.pebble.stop_services([harness.charm.pgbackrest_server_service]) + + # Reset the stop mock (it was called by the testing framework when stopping) + _stop.reset_mock() + assert harness.charm.backup.start_stop_pgbackrest_service() is True _stop.assert_not_called() + _send_signal.assert_not_called() _restart.assert_called_once() diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index cb7eb67060..bb2d09e456 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -73,7 +73,6 @@ def test_on_leader_elected(harness): patch("charm.Patroni.reload_patroni_configuration"), patch("charm.PostgresqlOperatorCharm._patch_pod_labels"), patch("charm.PostgresqlOperatorCharm._create_services") as _create_services, - patch("charm.PostgreSQLUpgrade.idle", new_callable=PropertyMock) as _idle, patch("charm.PostgresqlOperatorCharm.get_secret_from_id", return_value={}), patch("charm.PostgresqlOperatorCharm.get_secret_from_id", return_value={}), ): @@ -143,20 +142,12 @@ def test_on_leader_elected(harness): response = Mock() response.json.return_value = {"code": 403} _create_services.side_effect = ApiError(response=response) - _idle.return_value = True harness.set_leader(False) harness.set_leader() assert isinstance(harness.charm.unit.status, BlockedStatus) assert harness.charm.unit.status.message == "failed to create k8s services" - # No error when upgrading the cluster. 
- harness.charm.unit.status = ActiveStatus() - _idle.return_value = False - harness.set_leader(False) - harness.set_leader() - assert isinstance(harness.charm.unit.status, ActiveStatus) - # No trust when annotating _client.return_value.get.side_effect = ApiError(response=response) harness.set_leader(False) @@ -188,9 +179,6 @@ def test_on_postgresql_pebble_ready(harness): with ( patch("charm.Path"), patch("charm.PostgresqlOperatorCharm._set_active_status") as _set_active_status, - patch( - "charm.Patroni.rock_postgresql_version", new_callable=PropertyMock - ) as _rock_postgresql_version, patch( "charm.Patroni.primary_endpoint_ready", new_callable=PropertyMock ) as _primary_endpoint_ready, @@ -201,18 +189,15 @@ def test_on_postgresql_pebble_ready(harness): patch("charm.PostgresqlOperatorCharm.postgresql") as _postgresql, patch( "charm.PostgresqlOperatorCharm._create_services", - side_effect=[None, _FakeApiError, _FakeApiError, None], + side_effect=[None, _FakeApiError, None], ) as _create_services, patch("charm.Patroni.member_started") as _member_started, - patch("charm.PostgreSQLUpgrade.idle", new_callable=PropertyMock) as _idle, patch("charm.PostgresqlOperatorCharm._patch_pod_labels"), patch("charm.PostgresqlOperatorCharm._on_leader_elected"), patch("charm.PostgresqlOperatorCharm._push_file_to_workload"), patch("charm.PostgresqlOperatorCharm._create_pgdata") as _create_pgdata, patch("charm.PostgresqlOperatorCharm.get_secret", return_value="secret"), ): - _rock_postgresql_version.return_value = "16.6" - # Mock the primary endpoint ready property values. _primary_endpoint_ready.side_effect = [False, True, True] @@ -233,16 +218,10 @@ def test_on_postgresql_pebble_ready(harness): _set_active_status.assert_not_called() # Check for a Blocked status when a failure happens. - _idle.return_value = True harness.container_pebble_ready(POSTGRESQL_CONTAINER) assert isinstance(harness.model.unit.status, BlockedStatus) _set_active_status.assert_not_called() - # No error when upgrading the cluster. - _idle.return_value = False - harness.container_pebble_ready(POSTGRESQL_CONTAINER) - _set_active_status.assert_called_once() - # Check for the Active status. 
_set_active_status.reset_mock() harness.container_pebble_ready(POSTGRESQL_CONTAINER) @@ -259,14 +238,10 @@ def test_on_postgresql_pebble_ready(harness): def test_on_postgresql_pebble_ready_no_connection(harness): with ( - patch( - "charm.Patroni.rock_postgresql_version", new_callable=PropertyMock - ) as _rock_postgresql_version, patch("charm.PostgresqlOperatorCharm._create_pgdata"), ): mock_event = MagicMock() mock_event.workload = harness.model.unit.get_container(POSTGRESQL_CONTAINER) - _rock_postgresql_version.return_value = "16.6" harness.charm._on_postgresql_pebble_ready(mock_event) @@ -286,9 +261,6 @@ def test_on_config_changed(harness): patch( "charm.PostgresqlOperatorCharm._validate_config_options" ) as _validate_config_options, - patch( - "charm.PostgreSQLUpgrade.idle", return_value=False, new_callable=PropertyMock - ) as _idle, patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, patch( "charm.PostgresqlOperatorCharm.updated_synchronous_node_count", return_value=True @@ -312,15 +284,8 @@ def test_on_config_changed(harness): mock_event.defer.assert_called_once_with() mock_event.defer.reset_mock() - # Defers if upgrade is not idle + # Defers on db connection error _is_cluster_initialised.return_value = True - mock_event = Mock() - harness.charm._on_config_changed(mock_event) - mock_event.defer.assert_called_once_with() - mock_event.defer.reset_mock() - - # Deferst on db connection error - _idle.return_value = True _validate_config_options.side_effect = psycopg2.OperationalError harness.charm._on_config_changed(mock_event) mock_event.defer.assert_called_once_with() @@ -382,7 +347,6 @@ def test_on_update_status(harness): patch("charm.Patroni.get_primary") as _get_primary, patch("ops.model.Container.pebble") as _pebble, patch("ops.model.Container.restart") as _restart, - patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"), patch( "charm.PostgresqlOperatorCharm.is_standby_leader", new_callable=PropertyMock, @@ -466,7 +430,6 @@ def test_on_update_status_with_error_on_get_primary(harness): patch("charm.Patroni.member_started") as _member_started, patch("charm.Patroni.get_primary") as _get_primary, patch("ops.model.Container.pebble") as _pebble, - patch("upgrade.PostgreSQLUpgrade.idle", return_value=True), ): # Mock the access to the list of Pebble services. _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)] @@ -640,7 +603,6 @@ def test_on_update_status_after_restore_operation(harness): patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, patch("ops.model.Container.pebble") as _pebble, - patch("upgrade.PostgreSQLUpgrade.idle", return_value=True), ): rel_id = harness.model.get_relation(PEER).id # Mock the access to the list of Pebble services to test a failed restore. 
@@ -708,31 +670,19 @@ def test_on_update_status_after_restore_operation(harness): def test_on_upgrade_charm(harness): with ( - patch( - "charms.data_platform_libs.v0.upgrade.DataUpgrade._upgrade_supported_check" - ) as _upgrade_supported_check, patch( "charm.PostgresqlOperatorCharm._patch_pod_labels", - side_effect=[None, _FakeApiError, None], + side_effect=[_FakeApiError, None], ) as _patch_pod_labels, patch( "charm.PostgresqlOperatorCharm._create_services", side_effect=[_FakeApiError, None, None], ) as _create_services, patch("charm.PostgresqlOperatorCharm.push_tls_files_to_workload"), - patch("charm.PostgreSQLUpgrade.idle", new_callable=PropertyMock) as _idle, ): - # Test when the cluster is being upgraded. - harness.charm.unit.status = ActiveStatus() - _idle.return_value = False - harness.charm.on.upgrade_charm.emit() - _create_services.assert_not_called() - _patch_pod_labels.assert_called_once() - assert isinstance(harness.charm.unit.status, ActiveStatus) - # Test with a problem happening when trying to create the k8s resources. + harness.charm.unit.status = ActiveStatus() _patch_pod_labels.reset_mock() - _idle.return_value = True harness.charm.on.upgrade_charm.emit() _create_services.assert_called_once() _patch_pod_labels.assert_not_called() @@ -1359,7 +1309,6 @@ def test_update_config(harness): "charm.PostgresqlOperatorCharm._is_workload_running", new_callable=PropertyMock ) as _is_workload_running, patch("charm.Patroni.render_patroni_yml_file") as _render_patroni_yml_file, - patch("charm.PostgreSQLUpgrade") as _upgrade, patch("charm.PostgresqlOperatorCharm.is_primary", return_value=False), patch( "charm.PostgresqlOperatorCharm.is_tls_enabled", new_callable=PropertyMock @@ -1369,7 +1318,6 @@ def test_update_config(harness): rel_id = harness.model.get_relation(PEER).id # Mock some properties. harness.set_can_connect(POSTGRESQL_CONTAINER, True) - harness.add_relation("upgrade", harness.charm.app.name) postgresql_mock.is_tls_enabled = PropertyMock(side_effect=[False, False, False, False]) _is_workload_running.side_effect = [True, True, False, True] _member_started.side_effect = [True, True, False] @@ -1385,7 +1333,7 @@ def test_update_config(harness): is_creating_backup=False, enable_ldap=False, enable_tls=False, - is_no_sync_member=False, + # is_no_sync_member=False, backup_id=None, stanza=None, restore_stanza=None, @@ -1416,7 +1364,7 @@ def test_update_config(harness): is_creating_backup=False, enable_ldap=False, enable_tls=True, - is_no_sync_member=False, + # is_no_sync_member=False, backup_id=None, stanza=None, restore_stanza=None, diff --git a/tests/unit/test_tls.py b/tests/unit/test_tls.py index da80c196bd..53bad12272 100644 --- a/tests/unit/test_tls.py +++ b/tests/unit/test_tls.py @@ -17,8 +17,8 @@ def harness(): harness = Harness(PostgresqlOperatorCharm) # Set up the initial relation and hooks. - peer_rel_id = harness.add_relation(PEER, "postgresql") - harness.add_relation_unit(peer_rel_id, "postgresql/0") + peer_rel_id = harness.add_relation(PEER, "postgresql-k8s") + harness.add_relation_unit(peer_rel_id, "postgresql-k8s/0") harness.begin() yield harness harness.cleanup() diff --git a/tests/unit/test_tls_transfer.py b/tests/unit/test_tls_transfer.py index 3c890daf23..62e008bd27 100644 --- a/tests/unit/test_tls_transfer.py +++ b/tests/unit/test_tls_transfer.py @@ -18,8 +18,8 @@ def harness(): harness = Harness(PostgresqlOperatorCharm) # Set up the initial relation and hooks. 
- peer_rel_id = harness.add_relation(PEER, "postgresql") - harness.add_relation_unit(peer_rel_id, "postgresql/0") + peer_rel_id = harness.add_relation(PEER, "postgresql-k8s") + harness.add_relation_unit(peer_rel_id, "postgresql-k8s/0") harness.begin() yield harness harness.cleanup() diff --git a/tests/unit/test_upgrade.py b/tests/unit/test_upgrade.py deleted file mode 100644 index c25a54620f..0000000000 --- a/tests/unit/test_upgrade.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. -from unittest.mock import MagicMock, PropertyMock, call, patch - -import pytest -import tenacity -from charms.data_platform_libs.v0.upgrade import ( - ClusterNotReadyError, - KubernetesClientError, -) -from lightkube.resources.apps_v1 import StatefulSet -from ops.testing import Harness - -from charm import PostgresqlOperatorCharm -from patroni import SwitchoverFailedError -from tests.unit.helpers import _FakeApiError - -POSTGRESQL_CONTAINER = "postgresql" - - -@pytest.fixture(autouse=True) -def harness(): - """Set up the test.""" - patcher = patch("lightkube.core.client.GenericSyncClient") - patcher.start() - harness = Harness(PostgresqlOperatorCharm) - harness.begin() - upgrade_relation_id = harness.add_relation("upgrade", "postgresql-k8s") - peer_relation_id = harness.add_relation("database-peers", "postgresql-k8s") - for rel_id in (upgrade_relation_id, peer_relation_id): - harness.add_relation_unit(rel_id, "postgresql-k8s/1") - harness.add_relation("restart", harness.charm.app.name) - with harness.hooks_disabled(): - harness.update_relation_data(upgrade_relation_id, "postgresql-k8s/1", {"state": "idle"}) - yield harness - harness.cleanup() - - -def test_is_no_sync_member(harness): - # Test when there is no list of sync-standbys in the relation data. - assert not harness.charm.upgrade.is_no_sync_member - upgrade_relation_id = harness.model.get_relation("upgrade").id - - # Test when the current unit is not part of the list of sync-standbys - # from the relation data. - with harness.hooks_disabled(): - harness.update_relation_data( - upgrade_relation_id, - harness.charm.app.name, - {"sync-standbys": '["postgresql-k8s/1", "postgresql-k8s/2"]'}, - ) - assert harness.charm.upgrade.is_no_sync_member - - # Test when the current unit is part of the list of sync-standbys from the relation data. 
- with harness.hooks_disabled(): - harness.update_relation_data( - upgrade_relation_id, - harness.charm.app.name, - { - "sync-standbys": f'["{harness.charm.unit.name}", "postgresql-k8s/1", "postgresql-k8s/2"]' - }, - ) - assert not harness.charm.upgrade.is_no_sync_member - - -def test_log_rollback(harness): - with ( - patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, - patch("upgrade.logger.info") as mock_logging, - ): - harness.charm.upgrade.log_rollback_instructions() - calls = [ - call( - "Run `juju refresh --revision postgresql-k8s` to initiate the rollback" - ), - call( - "and `juju run-action postgresql-k8s/leader resume-upgrade` to resume the rollback" - ), - ] - mock_logging.assert_has_calls(calls) - - -def test_on_postgresql_pebble_ready(harness): - with ( - patch("charm.PostgreSQLUpgrade.set_unit_failed") as _set_unit_failed, - patch("charm.PostgreSQLUpgrade.set_unit_completed") as _set_unit_completed, - patch( - "charm.Patroni.is_replication_healthy", new_callable=PropertyMock - ) as _is_replication_healthy, - patch("charm.Patroni.cluster_members", new_callable=PropertyMock) as _cluster_members, - patch("upgrade.wait_fixed", return_value=tenacity.wait_fixed(0)), - patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, - ): - # Set some side effects to test multiple situations. - _member_started.side_effect = [False, True, True, True] - upgrade_relation_id = harness.model.get_relation("upgrade").id - - # Test when the unit status is different from "upgrading". - mock_event = MagicMock() - harness.charm.upgrade._on_postgresql_pebble_ready(mock_event) - _member_started.assert_not_called() - mock_event.defer.assert_not_called() - _set_unit_completed.assert_not_called() - _set_unit_failed.assert_not_called() - - # Test when the unit status is equal to "upgrading", but the member hasn't started yet. - with harness.hooks_disabled(): - harness.update_relation_data( - upgrade_relation_id, harness.charm.unit.name, {"state": "upgrading"} - ) - harness.charm.upgrade._on_postgresql_pebble_ready(mock_event) - _member_started.assert_called_once() - mock_event.defer.assert_called_once() - _set_unit_completed.assert_not_called() - _set_unit_failed.assert_not_called() - - # Test when the unit status is equal to "upgrading", and the member has already started - # but not joined the cluster yet. - _member_started.reset_mock() - mock_event.defer.reset_mock() - _cluster_members.return_value = ["postgresql-k8s-1"] - harness.charm.upgrade._on_postgresql_pebble_ready(mock_event) - _member_started.assert_called_once() - mock_event.defer.assert_not_called() - _set_unit_completed.assert_not_called() - _set_unit_failed.assert_called_once() - - # Test when the member has already joined the cluster, but replication - # is not healthy yet. - _set_unit_failed.reset_mock() - mock_event.defer.reset_mock() - _cluster_members.return_value = [ - harness.charm.unit.name.replace("/", "-"), - "postgresql-k8s-1", - ] - _is_replication_healthy.return_value = False - harness.charm.upgrade._on_postgresql_pebble_ready(mock_event) - mock_event.defer.assert_not_called() - _set_unit_completed.assert_not_called() - _set_unit_failed.assert_called_once() - - # Test when replication is healthy. 
- _member_started.reset_mock() - _set_unit_failed.reset_mock() - mock_event.defer.reset_mock() - _is_replication_healthy.return_value = True - harness.charm.upgrade._on_postgresql_pebble_ready(mock_event) - _member_started.assert_called_once() - mock_event.defer.assert_not_called() - _set_unit_completed.assert_called_once() - _set_unit_failed.assert_not_called() - - -def test_on_upgrade_changed(harness): - with ( - patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, - patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, - patch( - "charm.PostgresqlOperatorCharm.updated_synchronous_node_count" - ) as _updated_synchronous_node_count, - ): - harness.set_can_connect(POSTGRESQL_CONTAINER, True) - _member_started.return_value = False - relation = harness.model.get_relation("upgrade") - harness.charm.on.upgrade_relation_changed.emit(relation) - _update_config.assert_not_called() - - _member_started.return_value = True - harness.charm.on.upgrade_relation_changed.emit(relation) - _update_config.assert_called_once() - _updated_synchronous_node_count.assert_called_once_with() - - -def test_pre_upgrade_check(harness): - with ( - patch( - "charm.PostgreSQLUpgrade._set_rolling_update_partition" - ) as _set_rolling_update_partition, - patch("charm.PostgreSQLUpgrade._set_list_of_sync_standbys") as _set_list_of_sync_standbys, - patch("charm.Patroni.switchover") as _switchover, - patch("charm.Patroni.get_sync_standby_names") as _get_sync_standby_names, - patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, - patch("charm.Patroni.get_primary") as _get_primary, - patch( - "charm.Patroni.is_creating_backup", new_callable=PropertyMock - ) as _is_creating_backup, - patch("charm.Patroni.are_all_members_ready") as _are_all_members_ready, - ): - harness.set_leader(True) - - # Set some side effects to test multiple situations. - _are_all_members_ready.side_effect = [False, True, True, True, True, True, True] - _is_creating_backup.side_effect = [True, False, False, False, False, False] - _switchover.side_effect = [None, SwitchoverFailedError] - - # Test when not all members are ready. - try: - harness.charm.upgrade.pre_upgrade_check() - assert False - except ClusterNotReadyError: - pass - _switchover.assert_not_called() - _set_list_of_sync_standbys.assert_not_called() - _set_rolling_update_partition.assert_not_called() - - # Test when a backup is being created. - try: - harness.charm.upgrade.pre_upgrade_check() - assert False - except ClusterNotReadyError: - pass - _switchover.assert_not_called() - _set_list_of_sync_standbys.assert_not_called() - _set_rolling_update_partition.assert_not_called() - - # Test when the primary is already the first unit. - unit_zero_name = f"{harness.charm.app.name}/0" - _get_primary.return_value = unit_zero_name - harness.charm.upgrade.pre_upgrade_check() - _switchover.assert_not_called() - _set_list_of_sync_standbys.assert_not_called() - _set_rolling_update_partition.assert_called_once_with( - harness.charm.app.planned_units() - 1 - ) - - # Test when there are no sync-standbys. - _set_rolling_update_partition.reset_mock() - _get_primary.return_value = f"{harness.charm.app.name}/1" - _get_sync_standby_names.return_value = [] - try: - harness.charm.upgrade.pre_upgrade_check() - assert False - except ClusterNotReadyError: - pass - _switchover.assert_not_called() - _set_list_of_sync_standbys.assert_not_called() - _set_rolling_update_partition.assert_not_called() - - # Test when the first unit is a sync-standby. 
- _set_rolling_update_partition.reset_mock() - _get_sync_standby_names.return_value = [unit_zero_name, f"{harness.charm.app.name}/2"] - harness.charm.upgrade.pre_upgrade_check() - _switchover.assert_called_once_with(unit_zero_name) - _set_list_of_sync_standbys.assert_not_called() - _set_rolling_update_partition.assert_called_once_with( - harness.charm.app.planned_units() - 1 - ) - - # Test when the switchover fails. - _switchover.reset_mock() - _set_rolling_update_partition.reset_mock() - try: - harness.charm.upgrade.pre_upgrade_check() - assert False - except ClusterNotReadyError: - pass - _switchover.assert_called_once_with(unit_zero_name) - _set_list_of_sync_standbys.assert_not_called() - _set_rolling_update_partition.assert_not_called() - - # Test when the first unit is neither the primary nor a sync-standby. - _switchover.reset_mock() - _set_rolling_update_partition.reset_mock() - _get_sync_standby_names.return_value = f'["{harness.charm.app.name}/2"]' - try: - harness.charm.upgrade.pre_upgrade_check() - assert False - except ClusterNotReadyError: - pass - _switchover.assert_not_called() - _set_list_of_sync_standbys.assert_called_once() - _set_rolling_update_partition.assert_not_called() - - -def test_set_list_of_sync_standbys(harness): - with patch("charm.Patroni.get_sync_standby_names") as _get_sync_standby_names: - upgrade_relation_id = harness.model.get_relation("upgrade").id - peer_relation_id = harness.model.get_relation("database-peers").id - # Mock some return values. - _get_sync_standby_names.side_effect = [ - ["postgresql-k8s/1"], - ["postgresql-k8s/0", "postgresql-k8s/1"], - ["postgresql-k8s/1", "postgresql-k8s/2"], - ] - - # Test when the there are less than 3 units in the cluster. - harness.charm.upgrade._set_list_of_sync_standbys() - assert "sync-standbys" not in harness.get_relation_data( - upgrade_relation_id, harness.charm.app - ) - - # Test when the there are 3 units in the cluster. - for rel_id in (upgrade_relation_id, peer_relation_id): - harness.add_relation_unit(rel_id, "postgresql-k8s/2") - with harness.hooks_disabled(): - harness.update_relation_data( - upgrade_relation_id, "postgresql-k8s/2", {"state": "idle"} - ) - harness.charm.upgrade._set_list_of_sync_standbys() - assert ( - harness.get_relation_data(upgrade_relation_id, harness.charm.app)["sync-standbys"] - == '["postgresql-k8s/0"]' - ) - - # Test when the unit zero is already a sync-standby. - for rel_id in (upgrade_relation_id, peer_relation_id): - harness.add_relation_unit(rel_id, "postgresql-k8s/3") - with harness.hooks_disabled(): - harness.update_relation_data( - upgrade_relation_id, "postgresql-k8s/3", {"state": "idle"} - ) - harness.charm.upgrade._set_list_of_sync_standbys() - assert ( - harness.get_relation_data(upgrade_relation_id, harness.charm.app)["sync-standbys"] - == '["postgresql-k8s/0", "postgresql-k8s/1"]' - ) - - # Test when the unit zero is not a sync-standby yet. - harness.charm.upgrade._set_list_of_sync_standbys() - assert ( - harness.get_relation_data(upgrade_relation_id, harness.charm.app)["sync-standbys"] - == '["postgresql-k8s/1", "postgresql-k8s/0"]' - ) - - -def test_set_rolling_update_partition(harness): - with patch("upgrade.Client") as _client: - # Test the successful operation. 
- harness.charm.upgrade._set_rolling_update_partition(2) - _client.return_value.patch.assert_called_once_with( - StatefulSet, - name=harness.charm.app.name, - namespace=harness.charm.model.name, - obj={"spec": {"updateStrategy": {"rollingUpdate": {"partition": 2}}}}, - ) - - # Test an operation that failed due to lack of Juju's trust flag. - _client.return_value.patch.reset_mock() - _client.return_value.patch.side_effect = _FakeApiError(403) - try: - harness.charm.upgrade._set_rolling_update_partition(2) - assert False - except KubernetesClientError as exception: - assert exception.cause == "`juju trust` needed" - - # Test an operation that failed due to some other reason. - _client.return_value.patch.reset_mock() - _client.return_value.patch.side_effect = _FakeApiError - try: - harness.charm.upgrade._set_rolling_update_partition(2) - assert False - except KubernetesClientError as exception: - assert exception.cause == "broken"