Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[21.05] fc-ceph: improve osd safety check UX #806

Merged
merged 4 commits into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions pkgs/fc/ceph/src/fc/ceph/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,18 @@ def main(args=sys.argv[1:]):
"instead of autodetecting it.",
)
parser_destroy.add_argument(
"--unsafe-destroy",
"--no-safety-check",
action="store_true",
help="Skip the check whether an OSD is safe to destroy without "
"affecting data redundancy. This can result in data loss or cluster failure!!",
"reducing data redundancy below the point of cluster availability. "
"WARNING: This can result in data loss or cluster failure!!",
)
parser_destroy.add_argument(
"--strict-safety-check",
action="store_true",
help="Stricter check whether destruction/stopping of the OSD reduces "
"the data availability at all, even when it is not below the point of "
"cluster availability.",
)
parser_destroy.set_defaults(action="destroy")

Expand Down Expand Up @@ -117,6 +125,20 @@ def main(args=sys.argv[1:]):
help="IDs of OSD to deactivate. "
"Use `all` to deactivate all local OSDs.",
)
parser_deactivate.add_argument(
"--no-safety-check",
action="store_true",
help="Skip the check whether an OSD is safe to stop without "
"reducing data redundancy below the point of cluster availability. "
"WARNING: This can result in data loss or cluster failure!!",
)
parser_deactivate.add_argument(
"--strict-safety-check",
action="store_true",
help="Stricter check whether stopping of the OSD reduces "
"the data availability at all, even when it is not below the point of "
"cluster availability.",
)
parser_deactivate.set_defaults(action="deactivate")

parser_reactivate = osd_sub.add_parser(
Expand Down Expand Up @@ -146,10 +168,18 @@ def main(args=sys.argv[1:]):
"objectstore type.\nThe current type is detected automatically.",
)
parser_rebuild.add_argument(
"--unsafe-destroy",
"--no-safety-check",
action="store_true",
help="Skip the check whether an OSD is safe to destroy without "
"affecting data redundancy. This can result in data loss or cluster failure!!",
"reducing data redundancy below the point of cluster availability. "
"WARNING: This can result in data loss or cluster failure!!",
)
parser_rebuild.add_argument(
"--strict-safety-check",
action="store_true",
help="Stricter check whether destruction/stopping of the OSD reduces "
"the data availability at all, even when it is not below the point of "
"cluster availability.",
)
parser_rebuild.add_argument(
"ids",
Expand Down
124 changes: 91 additions & 33 deletions pkgs/fc/ceph/src/fc/ceph/osd/nautilus.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import time
import traceback
from subprocess import CalledProcessError
from typing import Optional
from typing import List, Optional

from fc.ceph.util import kill, mount_status, run

Expand Down Expand Up @@ -152,20 +152,26 @@ def _activate_single(
def destroy(
self,
ids: str,
unsafe_destroy: bool,
no_safety_check: bool,
strict_safety_check: bool,
force_objectstore_type: Optional[str] = None,
):
ids = self._parse_ids(ids, allow_non_local=f"DESTROY {ids}")

for id_ in ids:
try:
osd = OSD(id_, type=force_objectstore_type)
osd.purge(unsafe_destroy)
osd.purge(no_safety_check, strict_safety_check)
except Exception:
traceback.print_exc()

def deactivate(
self, ids: str, as_systemd_unit: bool = False, flush: bool = False
self,
ids: str,
as_systemd_unit: bool = False,
flush: bool = False,
no_safety_check: bool = False,
strict_safety_check: bool = False,
):
ids = self._parse_ids(ids)

Expand All @@ -176,14 +182,19 @@ def deactivate(
osd = OSD(id_)
self._deactivate_single(osd, as_systemd_unit, flush)
else:
if no_safety_check:
print("WARNING: Skipping stop safety check.")
else:
GenericOSD.run_safety_check(ids, strict_safety_check)
threads = []
for id_ in ids:
try:
osd = OSD(id_)
thread = threading.Thread(
target=lambda: self._deactivate_single(
osd, as_systemd_unit, flush
)
),
name=id_,
)
thread.start()
threads.append(thread)
Expand All @@ -193,6 +204,9 @@ def deactivate(
for thread in threads:
thread.join()

deactivated_osds = ", ".join([str(t.name) for t in threads])
print("Successfully deactivated OSDs", deactivated_osds)

def _deactivate_single(
self,
osd: "GenericOSD",
Expand Down Expand Up @@ -225,7 +239,8 @@ def rebuild(
self,
ids: str,
journal_size: str,
unsafe_destroy: bool,
no_safety_check: bool,
strict_safety_check: bool,
target_objectstore_type: Optional[str] = None,
):
ids = self._parse_ids(ids)
Expand All @@ -234,7 +249,10 @@ def rebuild(
try:
osd = OSD(id_)
osd.rebuild(
journal_size, unsafe_destroy, target_objectstore_type
journal_size=journal_size,
no_safety_check=no_safety_check,
target_objectstore_type=target_objectstore_type,
strict_safety_check=strict_safety_check,
)
except Exception:
traceback.print_exc()
Expand Down Expand Up @@ -469,25 +487,60 @@ def _create_crush_and_auth(self, volume, crush_location):

run.ceph("osd", "crush", "add", self.name, str(weight), crush_location)

def purge(self, unsafe_destroy):
"""Deletes an osd, including removal of auth keys and crush map entry"""
@staticmethod
def run_safety_check(ids: List[int], strict_safety_check: bool):
osnyx marked this conversation as resolved.
Show resolved Hide resolved
"""
Run a safety check on what would happen if the OSDs specified in `ids`
became unavailable.

# Safety net
if unsafe_destroy:
print("WARNING: Skipping destroy safety check.")
else:
By default, the `ceph osd ok-to-stop` is run and checks for remaining
data availability.
With `stric_safety_check`, the more strict `ceph osd safe-to-destroy`
checks whether edundancy is affected in any ways.

Raises a SystemExit if the check fails.
"""
id_str = " ".join(map(str, ids))
if strict_safety_check:
try:
run.ceph("osd", "safe-to-destroy", str(self.id))
run.ceph("osd", "safe-to-destroy", id_str)
except CalledProcessError as e:
print(
# fmt: off
"OSD not safe to destroy:", e.stderr,
"\nTo override this check, specify `--unsafe-destroy`. This can "
"\nTo override this check, remove the `--strict-safety-check` flag. "
"This can lead to reduced data redundancy, still within safety margins."
# fmt: on
)
# ceph already returns ERRNO-style returncodes, so just pass them through
sys.exit(e.returncode)
else:
try:
run.ceph("osd", "ok-to-stop", id_str)
except CalledProcessError as e:
print(
# fmt: off
"OSD not okay to stop:", e.stderr,
"\nTo override this check, specify `--no-safety-check`. This can "
"cause data loss or cluster failure!!"
# fmt: on
)
# do we have some generic or specific error return codes?
sys.exit(10)
# ceph already returns ERRNO-style returncodes, so just pass them through
sys.exit(e.returncode)

def purge(self, no_safety_check: bool, strict_safety_check: bool):
"""Deletes an osd, including removal of auth keys and crush map entry"""

# Safety net
if strict_safety_check and no_safety_check:
print(
"--no-safety-check and --strict-safety-check are incompatible flags."
)
sys.exit(10)
if no_safety_check:
print("WARNING: Skipping destroy safety check.")
else:
self.run_safety_check([self.id], strict_safety_check)

print(f"Destroying OSD {self.id} ...")
osdmanager = OSDManager()
Expand Down Expand Up @@ -586,7 +639,8 @@ def _destroy_and_rebuild_to(
target_osd_type: str,
journal: str,
journal_size: str,
unsafe_destroy: bool,
no_safety_check: bool,
strict_safety_check: bool,
device: str,
crush_location: str,
):
Expand All @@ -597,7 +651,7 @@ def _destroy_and_rebuild_to(
# `purge` also removes crush location and auth. A `destroy` would keep them, but
# as we re-use the creation commands which always handle crush and auth as well,
# for simplicity's sake this optimisation is not used.
self.purge(unsafe_destroy)
self.purge(no_safety_check, strict_safety_check)

# This is an "interesting" turn-around ...
manager = OSDManager()
Expand Down Expand Up @@ -736,8 +790,8 @@ def _create_post_and_register(self, lvm_journal_path):
# fmt: on
)

def purge(self, unsafe_destroy):
super().purge(unsafe_destroy)
def purge(self, no_safety_check, strict_safety_check):
super().purge(no_safety_check, strict_safety_check)

# Try deleting an external journal. The internal journal was already
# deleted during the generic destroy of the VG.
Expand All @@ -749,7 +803,8 @@ def purge(self, unsafe_destroy):
def rebuild(
self,
journal_size: str,
unsafe_destroy: bool,
no_safety_check: bool,
strict_safety_check: bool,
target_objectstore_type: Optional[str],
):
"""Fully destroy and create the FileStoreOSD again with the same properties,
Expand Down Expand Up @@ -784,10 +839,11 @@ def rebuild(
print(f"--journal={journal}")

self._destroy_and_rebuild_to(
target_osd_type,
journal,
journal_size,
unsafe_destroy,
target_osd_type=target_osd_type,
journal=journal,
journal_size=journal_size,
no_safety_check=no_safety_check,
strict_safety_check=strict_safety_check,
**oldosd_properties,
)

Expand Down Expand Up @@ -944,8 +1000,8 @@ def _create_post_and_register(self, lvm_wal_path):
# fmt: on
)

def purge(self, unsafe_destroy):
super().purge(unsafe_destroy)
def purge(self, no_safety_check, strict_safety_check):
super().purge(no_safety_check, strict_safety_check)

try:
run.lvremove("-f", self._locate_wal_lv(), check=False)
Expand All @@ -957,7 +1013,8 @@ def purge(self, unsafe_destroy):
def rebuild(
self,
journal_size: str,
unsafe_destroy: bool,
no_safety_check: bool,
strict_safety_check: bool,
target_objectstore_type: Optional[str],
):
"""Fully destroy and create the FileStoreOSD again with the same properties,
Expand Down Expand Up @@ -985,9 +1042,10 @@ def rebuild(
print(f"--journal={journal}")

self._destroy_and_rebuild_to(
target_osd_type,
journal,
journal_size,
unsafe_destroy,
target_osd_type=target_osd_type,
journal=journal,
journal_size=journal_size,
no_safety_check=no_safety_check,
strict_safety_check=strict_safety_check,
**oldosd_properties,
)
Loading
Loading