Skip to content

Commit

Permalink
17222 HW/SW inventory: Retain inventory history on central site
Browse files Browse the repository at this point in the history
This change squashes:
 Icba1ae13fcc2ee2da9dcd7a1d00701d46ae12ddd
 I46661437f4569b98418181334debd36cad0db844
 I8e1b96c78bf3ff3677012e616e27d21ce5b4666e
 I22b2f98f58f04a66159a02f568fa6ab4c0c8784b
 Ie95938a60b65e69bfcc57792932e43b30e282df6
 I6590e5bb43ea0bbfebc545b60d1cdeeb288709cd
 Ie5e94c183f816d7bdf52c63df03a8e5d6e76164a
 I0ec85cce2b740cb842d0c206edab85545a696816
 I52263fb40e3eb831cc863eb459bdddb11e134614

SUP-20276
SUP-22016

Change-Id: I7dee3836008c14408fbf3b6589905c483ba82cbb
  • Loading branch information
SoloJacobs committed Jan 14, 2025
1 parent a83a376 commit 37a726f
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 92 deletions.
17 changes: 17 additions & 0 deletions .werks/17222
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Title: HW/SW inventory: Retain inventory history on central site
Class: fix
Compatible: compat
Component: omd
Date: 1736779496
Edition: cre
Level: 1
Version: 2.2.0p39

This change affects customers who monitor hosts on a remote site.
Previously, the <code>Inventory history of host</code> would sporadically disappear on the central site, even though it was shown correctly on the remote site.
This was due to an incorrect implementation of the diskspace utility.
This implementation assumed that the historic data was only needed on the remote site.
With this change, diskspace only considers deleting the inventory history of hosts that have been deleted.

The faulty behaviour was introduced in <a href="https://checkmk.com/werk/13242">#Werk 13422</a>.
A similar but distinct issue is addressed in <a href="https://checkmk.com/werk/17223">#Werk 17223</a>.
1 change: 0 additions & 1 deletion omd/packages/check_mk/diskspace
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Glob patterns this package registers for the diskspace cleanup.
# NOTE(review): the patterns look relative to the site root -- confirm against
# the diskspace tool that consumes this file.
cleanup_paths = [
'var/mkeventd/history/*.log',
'var/mkeventd/messages/*.log',
'var/check_mk/inventory_archive/*/*',
'var/check_mk/core/archive/history-*',
]
161 changes: 70 additions & 91 deletions omd/packages/maintenance/diskspace
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ from typing import Any, Literal

from cmk.utils.paths import omd_root, var_dir
from cmk.utils.render import fmt_bytes
from cmk.utils.type_defs import HostName

opt_verbose = "-v" in sys.argv
opt_force = "-f" in sys.argv
Expand Down Expand Up @@ -215,57 +214,32 @@ def _oldest_candidate(file_infos: dict) -> str | None:
return None


def _cleanup_host_directory_for_local_hosts(
cleanup_hosts: set[HostName], base_path: str
) -> list[HostName]:
def _cleanup_host_directories(unaffected_hosts: set[str], base_path: str) -> set[str]:
"""
First find all directories not related to a known host.
"""
if not os.path.isdir(base_path):
return []
return set()

unrelated_dirs: list[str] = []
for host_dir in os.listdir(base_path):
if host_dir not in cleanup_hosts:
unrelated_dirs.append(host_dir)
abandoned = {host_dir for host_dir in os.listdir(base_path) if host_dir not in unaffected_hosts}

cleaned_up_hosts = _check_threshold_and_delete(unrelated_dirs, base_path)
return _check_threshold_and_delete(abandoned, base_path)

return cleaned_up_hosts


def _cleanup_host_directory_for_remote_hosts(cleaned_up_remote_hosts: set, base_path: str) -> list:
    """
    Select the entries below base_path whose names are contained in
    cleaned_up_remote_hosts and hand them to _check_threshold_and_delete.

    Returns whatever _check_threshold_and_delete reports as cleaned up, or an
    empty list when base_path is not a directory.
    """
    if not os.path.isdir(base_path):
        return []

    # Keep only the directory entries that are named like one of the given hosts.
    matching_dirs = [
        entry for entry in os.listdir(base_path) if entry in cleaned_up_remote_hosts
    ]

    return _check_threshold_and_delete(matching_dirs, base_path)


def _check_threshold_and_delete(unrelated_dirs: list[str], base_path: str) -> list:
def _check_threshold_and_delete(abandoned_hosts: set[str], base_path: str) -> set[str]:
"""
Find the latest modified file for each directory. When the latest
modified file is older than the threshold, delete all files including
the host base directory.
"""
assert cleanup_abandoned_host_files is not None
cleaned_up_hosts = []
for unrelated_dir in unrelated_dirs:
cleaned_up_hosts = set()
for unrelated_dir in abandoned_hosts:
path = f"{base_path}/{unrelated_dir}"
mtime: float = _newest_modification_time_in_dir(path)
if mtime < time.time() - cleanup_abandoned_host_files:
_delete_files_and_base_directory(path, "abandoned host")
cleaned_up_hosts.append(unrelated_dir)
cleaned_up_hosts.add(unrelated_dir)
else:
_verbose("Found abandoned host path (but not old enough): %s" % path)

Expand Down Expand Up @@ -296,16 +270,7 @@ def _newest_modification_time_in_dir(dir_path: str) -> float:
return mtime


def _get_configured_hosts() -> tuple[set, set, bool]:
"""
Get local known hosts for all kind of sites (central and remote).
For central sites, get also all known hosts, even the ones that are
assigned to remote sites.
"""
local_site_hosts: set = set()
all_hosts: set = set()
is_wato_remote_site = True

def _is_wato_remote_site() -> bool:
file_vars: dict = {}
if (
distr_wato_filepath := Path("~/etc/check_mk/conf.d/distributed_wato.mk").expanduser()
Expand All @@ -316,21 +281,63 @@ def _get_configured_hosts() -> tuple[set, set, bool]:
file_vars,
)

if not file_vars.get("is_wato_slave_site", False):
is_wato_remote_site = False
all_hosts.update(
return file_vars.get("is_wato_slave_site", False)


def _do_cleanup_central_site(retention_time: int, local_site_hosts: set[str]) -> None:
    """
    Clean up abandoned host directories on a central site.

    The full host list (all sites, offline hosts included) is queried from
    check_mk. Inventory archive directories are kept for every host in that
    list, while perfdata and RRD directories are kept only for the hosts in
    local_site_hosts. Afterwards the hosts that were cleaned up are reported
    through automation calls.

    Note: retention_time is accepted but not read inside this function.
    """
    list_all_hosts_cmd = ["check_mk", "--list-hosts", "--all-sites", "--include-offline"]
    try:
        listing = subprocess.check_output(list_all_hosts_cmd, encoding="utf-8")
    except subprocess.CalledProcessError as e:
        _verbose("Failed to get site hosts (%s). Skipping abandoned host files cleanup" % e)
        return
    all_hosts = set(listing.splitlines())

    # (hosts whose directories must stay, base directory), processed in this order.
    cleanup_targets = (
        (all_hosts, "%s/inventory_archive" % var_dir),
        (local_site_hosts, "%s/var/pnp4nagios/perfdata" % omd_root),
        (local_site_hosts, "%s/rrd" % var_dir),
    )

    cleaned_up: set[str] = set()
    for hosts_to_keep, base_path in cleanup_targets:
        cleaned_up |= _cleanup_host_directories(hosts_to_keep, base_path)

    # Hosts that were cleaned up and are not listed on any site.
    deleted_hosts = cleaned_up - all_hosts
    if deleted_hosts:
        _do_automation_call(deleted_hosts, "delete-hosts")

    # Hosts that were cleaned up and are listed, but not on the local site.
    remote_hosts = cleaned_up & (all_hosts - local_site_hosts)
    if remote_hosts:
        _do_automation_call(remote_hosts, "delete-hosts-known-remote")


local_site_hosts.update(
subprocess.check_output(
["check_mk", "--list-hosts", "--include-offline"], encoding="utf-8"
).splitlines()
def _do_cleanup_remote_site(retention_time: int, local_site_hosts: set[str]) -> None:
cleaned_up_non_local_hosts = (
_cleanup_host_directories(
local_site_hosts,
"%s/inventory_archive" % var_dir,
)
| _cleanup_host_directories(
local_site_hosts,
"%s/var/pnp4nagios/perfdata" % omd_root,
)
| _cleanup_host_directories(
local_site_hosts,
"%s/rrd" % var_dir,
)
)

return all_hosts, local_site_hosts, is_wato_remote_site
# Now call Check_MK to clean up other files for the hosts which we have
# cleaned up abandoned files for.
if cleaned_up_non_local_hosts:
_do_automation_call(cleaned_up_non_local_hosts, "delete-hosts")


def _do_cleanup_abandoned_host_files() -> None:
Expand All @@ -345,8 +352,14 @@ def _do_cleanup_abandoned_host_files() -> None:
if not cleanup_abandoned_host_files:
return

is_wato_remote_site = _is_wato_remote_site()

try:
all_hosts, local_site_hosts, is_wato_remote_site = _get_configured_hosts()
local_site_hosts = set(
subprocess.check_output(
["check_mk", "--list-hosts", "--include-offline"], encoding="utf-8"
).splitlines()
)
except subprocess.CalledProcessError as e:
_verbose("Failed to get site hosts (%s). Skipping abandoned host files cleanup" % e)
return
Expand All @@ -355,44 +368,10 @@ def _do_cleanup_abandoned_host_files() -> None:
_verbose("Found no hosts. Be careful and not cleaning up anything.")
return

cleanup_hosts = all_hosts if not is_wato_remote_site else local_site_hosts

# Base directories where each host has a sub-directory below with
# host related files inside
path_patterns: list[str] = [
"%s/inventory_archive" % var_dir,
"%s/rrd" % var_dir,
"%s/var/pnp4nagios/perfdata" % omd_root,
]

cleaned_up_local_hosts: set = set()
for base_path in path_patterns:
cleaned_up_local_hosts.update(
_cleanup_host_directory_for_local_hosts(
cleanup_hosts,
base_path,
)
)

# Now call Check_MK to clean up other files for the hosts which we have
# cleaned up abandoned files for.
if cleaned_up_local_hosts:
_do_automation_call(cleaned_up_local_hosts, "delete-hosts")

# Now call Check_MK to clean up files for hosts that still have files local
# but are only known on remote sites
if all_hosts:
remote_site_hosts = all_hosts - local_site_hosts
cleaned_up_remote_hosts: set = set()
for base_path in path_patterns:
cleaned_up_remote_hosts.update(
_cleanup_host_directory_for_remote_hosts(
remote_site_hosts,
base_path,
)
)
if cleaned_up_remote_hosts:
_do_automation_call(cleaned_up_remote_hosts, "delete-hosts-known-remote")
if is_wato_remote_site:
_do_cleanup_remote_site(cleanup_abandoned_host_files, local_site_hosts)
else:
_do_cleanup_central_site(cleanup_abandoned_host_files, local_site_hosts)


def _cleanup_aged() -> None:
Expand Down

0 comments on commit 37a726f

Please sign in to comment.