diff --git a/files/docker.cfg b/files/docker.cfg new file mode 100644 index 0000000000000000000000000000000000000000..8274c33b5ebed73b9d537ee70a0f03f3079fefcb --- /dev/null +++ b/files/docker.cfg @@ -0,0 +1,54 @@ +# Copyright (C) 2019 Checkmk GmbH - License: GNU General Public License v2 +# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and +# conditions defined in the file COPYING, which is part of this source code package. + +# This is an example configuration file for the plugin +# +# mk_docker.py +# +# It is designed to give you an impression of available +# options. The specific choice in this file is a valid setup, +# but probably not suitable for your use case. +# If you intend to run the plugin with the default options, +# you do not need any configuration file at all. + +# You must specify one section of the name DOCKER (additional sections are ignored). +[DOCKER] + +# SELECTION OF AGENT SECTIONS (SERVICES) TO CREATE +# If some of the sections take too long to run, and you don't need them, you +# can disable them by specifying a comma separated list (Default: empty string +# - run all sections). To disable the sections <<<docker_node_disk_usage>>> +# and <<<docker_node_images>>>, for example, provide: +skip_sections: docker_node_disk_usage,docker_node_images +# You may skip any of the following sections: +# * docker_node_disk_usage: get df like info of disk usage (may take long) +# * docker_node_images: get detailed information on all images and containers +# (for HW/SW inventory) +# * docker_node_network: get network information +# The following sections send piggyback information to monitored containers: +# * docker_container_node_name: display nodes name on container +# * docker_container_status: container status/health according to docker health API +# * docker_container_labels: containers labels +# * docker_container_network: containers network configuration +# * docker_container_agent: retrieve information by running the +# check_mk_agent inside the container +# If no agent was installed on the container: +# * docker_container_mem: container memory stats +# * docker_container_cpu: container cpu utilization +# * docker_container_diskstat container disk stats + +# CONTAINER ID +# You can choose what to use as the container identifier. This will +# affect the name used for the piggyback host corresponding to the +# container, as well as items for services created on the node for each +# container. +# By default, the identifier is assumed to be the first 12 characters +# of the container UUID. You can choose to use the full ID or the containers +# name instead. Allowed values are "short" (the default), "long" and "name". +container_id: name + +# BASE URL +# By default, we are trying to connect to the docker API engine +# via the unix socket: +base_url: unix://var/run/docker.sock \ No newline at end of file diff --git a/files/mk_docker.py b/files/mk_docker.py new file mode 100644 index 0000000000000000000000000000000000000000..81caeb2497363a0e751df98e6b7517577bbb5585 --- /dev/null +++ b/files/mk_docker.py @@ -0,0 +1,747 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (C) 2019 Checkmk GmbH - License: GNU General Public License v2 +# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and +# conditions defined in the file COPYING, which is part of this source code package. +r"""Check_MK Agent Plugin: mk_docker.py + +This plugin is configured using an ini-style configuration file, +i.e. a file with lines of the form 'key: value'. +At 'agents/cfg_examples/mk_docker.cfg' (relative to the check_mk +source code directory ) you should find some example configuration +files. For more information on possible configurations refer to the +file docker.cfg in said directory. +The docker library must be installed on the system executing the +plugin ("pip install docker"). + +This plugin it will be called by the agent without any arguments. +""" + +from __future__ import with_statement + +__version__ = "2.2.0p8" + +# NOTE: docker is available for python versions from 2.6 / 3.3 + +import argparse +import configparser +import functools +import json +import logging +import multiprocessing +import os +import pathlib +import struct +import sys +import time + +try: + from typing import Dict, Tuple, Union # noqa: F401 # pylint: disable=unused-import +except ImportError: + pass + + +def which(prg): + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, prg)) and os.access(os.path.join(path, prg), os.X_OK): + return os.path.join(path, prg) + return None + + +# The "import docker" checks below result in agent sections being created. This +# is a way to end the plugin in case it is being executed on a non docker host +if ( + not os.path.isfile("/var/lib/docker") + and not os.path.isfile("/var/run/docker") + and not which("docker") +): + sys.stderr.write("mk_docker.py: Does not seem to be a docker host. Terminating.\n") + sys.exit(1) + +try: + import docker # type: ignore[import] +except ImportError: + sys.stdout.write( + "<<<docker_node_info:sep(124)>>>\n" + "@docker_version_info|{}\n" + '{"Critical": "Error: mk_docker requires the docker library.' + ' Please install it on the monitored system (%s install docker)."}\n' + % ("pip3" if sys.version_info.major == 3 else "pip") + ) + sys.exit(0) + +if int(docker.__version__.split(".", 1)[0]) < 2: + sys.stdout.write( + "<<<docker_node_info:sep(124)>>>\n" + "@docker_version_info|{}\n" + '{"Critical": "Error: mk_docker requires the docker library >= 2.0.0.' + ' Please install it on the monitored system (%s install docker)."}\n' + % ("pip3" if sys.version_info.major == 3 else "pip") + ) + sys.exit(0) + +DEBUG = "--debug" in sys.argv[1:] + +VERSION = "0.1" + +DEFAULT_CFG_FILE = os.path.join(os.getenv("MK_CONFDIR", ""), "docker.cfg") + +DEFAULT_CFG_SECTION = { + "base_url": "unix://var/run/docker.sock", + "skip_sections": "", + "container_id": "short", +} + +LOGGER = logging.getLogger(__name__) + + +def parse_arguments(argv=None): + if argv is None: + argv = sys.argv[1:] + + prog, descr, epilog = __doc__.split("\n\n") + parser = argparse.ArgumentParser(prog=prog, description=descr, epilog=epilog) + parser.add_argument( + "--debug", action="store_true", help="""Debug mode: raise Python exceptions""" + ) + parser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="""Verbose mode (for even more output use -vvv)""", + ) + parser.add_argument( + "-c", + "--config-file", + default=DEFAULT_CFG_FILE, + help="""Read config file (default: $MK_CONFDIR/docker.cfg)""", + ) + + args = parser.parse_args(argv) + + fmt = "%%(levelname)5s: %s%%(message)s" + if args.verbose == 0: + LOGGER.propagate = False + elif args.verbose == 1: + logging.basicConfig(level=logging.INFO, format=fmt % "") + else: + logging.basicConfig(level=logging.DEBUG, format=fmt % "(line %(lineno)3d) ") + if args.verbose < 3: + logging.getLogger("urllib3").setLevel(logging.WARNING) + + LOGGER.debug("parsed args: %r", args) + return args + + +def get_config(cfg_file): + config = configparser.ConfigParser(DEFAULT_CFG_SECTION) + LOGGER.debug("trying to read %r", cfg_file) + files_read = config.read(cfg_file) + LOGGER.info("read configration file(s): %r", files_read) + section_name = "DOCKER" if config.sections() else "DEFAULT" + conf_dict = dict(config.items(section_name)) # type: Dict[str, Union[str, Tuple]] + skip_sections = conf_dict.get("skip_sections", "") + if isinstance(skip_sections, str): + skip_list = skip_sections.split(",") + conf_dict["skip_sections"] = tuple(n.strip() for n in skip_list) + + return conf_dict + + +class Section(list): + """a very basic agent section class""" + + _OUTPUT_LOCK = multiprocessing.Lock() + + version_info = { + "PluginVersion": VERSION, + "DockerPyVersion": docker.__version__, + } + + # Should we need to parallelize one day, change this to be + # more like the Section class in agent_azure, for instance + def __init__(self, name=None, piggytarget=None): + super().__init__() + if piggytarget is not None: + self.append("<<<<%s>>>>" % piggytarget) + if name is not None: + self.append("<<<docker_%s:sep(124)>>>" % name) + version_json = json.dumps(Section.version_info) + self.append("@docker_version_info|%s" % version_json) + self.append("<<<docker_%s:sep(0)>>>" % name) + + def write(self): + if self[0].startswith("<<<<"): + self.append("<<<<>>>>") + with self._OUTPUT_LOCK: + for line in self: + sys.stdout.write("%s\n" % line) + sys.stdout.flush() + + +def report_exception_to_server(exc, location): + LOGGER.info("handling exception: %s", exc) + msg = "Plugin exception in %s: %s" % (location, exc) + sec = Section("node_info") + sec.append(json.dumps({"Unknown": msg})) + sec.write() + + +class ParallelDfCall: + """handle parallel calls of super().df() + + The Docker API will only allow one super().df() call at a time. + This leads to problems when the plugin is executed multiple times + in parallel. + """ + + def __init__(self, call): + self._call = call + self._vardir = pathlib.Path(os.getenv("MK_VARDIR", "")) + self._spool_file = self._vardir / "mk_docker_df.spool" + self._tmp_file_templ = "mk_docker_df.tmp.%s" + self._my_tmp_file = self._vardir / (self._tmp_file_templ % os.getpid()) + + def __call__(self): + try: + self._my_tmp_file.touch() + data = self._new_df_result() + except docker.errors.APIError as exc: + LOGGER.debug("df API call failed: %s", exc) + data = self._spool_df_result() + else: + # the API call succeeded, no need for any tmp files + for file_ in self._iter_tmp_files(): + self._unlink(file_) + finally: + # what ever happens: remove my tmp file + self._unlink(self._my_tmp_file) + + return data + + def _new_df_result(self): + data = self._call() + self._write_df_result(data) + return data + + def _iter_tmp_files(self): + return self._vardir.glob(self._tmp_file_templ % "*") + + @staticmethod + def _unlink(file_): + try: + file_.unlink() + except OSError: + pass + + def _spool_df_result(self): + # check every 0.1 seconds + tick = 0.1 + # if the df command takes more than 60 seconds, you probably want to + # execute the plugin asynchronously. This should cover a majority of cases. + timeout = 60 + for _ in range(int(timeout / tick)): + time.sleep(tick) + if not any(self._iter_tmp_files()): + break + + return self._read_df_result() + + def _write_df_result(self, data): + with self._my_tmp_file.open("wb") as file_: + file_.write(json.dumps(data).encode("utf-8")) + try: + self._my_tmp_file.rename(self._spool_file) + except OSError: + # CMK-12642: It can happen that two df calls succeed almost at the same time. Then, one + # process attempts to move while the other one already deleted all temp files. + pass + + def _read_df_result(self): + """read from the spool file + + Don't handle FileNotFound - the plugin can deal with it, and it's easier to debug. + """ + with self._spool_file.open() as file_: + return json.loads(file_.read()) + + +class MKDockerClient(docker.DockerClient): + """a docker.DockerClient that caches containers and node info""" + + API_VERSION = "auto" + _DEVICE_MAP_LOCK = multiprocessing.Lock() + + def __init__(self, config): + super().__init__(config["base_url"], version=MKDockerClient.API_VERSION) + all_containers = _robust_inspect(self, "containers") + if config["container_id"] == "name": + self.all_containers = {c.attrs["Name"].lstrip("/"): c for c in all_containers} + elif config["container_id"] == "long": + self.all_containers = {c.attrs["Id"]: c for c in all_containers} + else: + self.all_containers = {c.attrs["Id"][:12]: c for c in all_containers} + self._env = {"REMOTE": os.getenv("REMOTE", "")} + self._container_stats = {} + self._device_map = None + self.node_info = self.info() + + self._df_caller = ParallelDfCall(call=super().df) + + def df(self): + return self._df_caller() + + def device_map(self): + with self._DEVICE_MAP_LOCK: + if self._device_map is not None: + return self._device_map + + self._device_map = {} + for device in os.listdir("/sys/block"): + with open("/sys/block/%s/dev" % device) as handle: + self._device_map[handle.read().strip()] = device + + return self._device_map + + @staticmethod + def iter_socket(sock, descriptor): + """iterator to recv data from container socket""" + header = docker.utils.socket.read(sock, 8) + while header: + actual_descriptor, length = struct.unpack(">BxxxL", header) + while length: + data = docker.utils.socket.read(sock, length) + length -= len(data) + LOGGER.debug("Received data: %r", data) + if actual_descriptor == descriptor: + yield data.decode("UTF-8") + header = docker.utils.socket.read(sock, 8) + + def get_stdout(self, exec_return_val): + """read stdout from container process""" + if isinstance(exec_return_val, tuple): + # it's a tuple since version 3.0.0 + exit_code, sock = exec_return_val + if exit_code not in (0, None): + return "" + else: + sock = exec_return_val + + return "".join(self.iter_socket(sock, 1)) + + def run_agent(self, container): + """run checkmk agent in container""" + result = container.exec_run(["check_mk_agent"], environment=self._env, socket=True) + return self.get_stdout(result) + + def get_container_stats(self, container_key): + """return cached container stats""" + try: + return self._container_stats[container_key] + except KeyError: + pass + + container = self.all_containers[container_key] + if not container.status == "running": + return self._container_stats.setdefault(container_key, None) + + # We use the streaming mode here because it faciliates error handling. If a container is + # removed at exactly the same time when we query the stats, we get StopIteration in + # streaming mode. In non-streaming mode, the error type is version-dependent. + stats_generator = container.stats(stream=True, decode=True) + try: + next(stats_generator) # we need to advance the generator by one to get useful data + stats = next(stats_generator) + except ( + # container was removed in between collecting containers and here + docker.errors.NotFound, + # container is removed just now; it could be that under very old docker versions (eg. + # 1.31), there are other scenarios causing this exception (SUP-10974) + StopIteration, + ): + return self._container_stats.setdefault(container_key, None) + + return self._container_stats.setdefault(container_key, stats) + + +def time_it(func): + """Decorator to time the function""" + + @functools.wraps(func) + def wrapped(*args, **kwargs): + before = time.time() + try: + return func(*args, **kwargs) + finally: + LOGGER.info("%r took %ss", func.__name__, time.time() - before) + + return wrapped + + +@time_it +def set_version_info(client): + data = client.version() + LOGGER.debug(data) + Section.version_info["ApiVersion"] = data.get("ApiVersion") + + +# . +# .--Sections------------------------------------------------------------. +# | ____ _ _ | +# | / ___| ___ ___| |_(_) ___ _ __ ___ | +# | \___ \ / _ \/ __| __| |/ _ \| '_ \/ __| | +# | ___) | __/ (__| |_| | (_) | | | \__ \ | +# | |____/ \___|\___|\__|_|\___/|_| |_|___/ | +# | | +# +----------------------------------------------------------------------+ +# | | +# '----------------------------------------------------------------------' + + +def is_disabled_section(config, section_name): + """Skip the section, if configured to do so""" + if section_name in config["skip_sections"]: + LOGGER.info("skipped section: %s", section_name) + return True + return False + + +@time_it +def section_node_info(client): + LOGGER.debug(client.node_info) + section = Section("node_info") + section.append(json.dumps(client.node_info)) + section.write() + + +@time_it +def section_node_disk_usage(client): + """docker system df""" + section = Section("node_disk_usage") + try: + data = client.df() + except docker.errors.APIError as exc: + if DEBUG: + raise + section.write() + LOGGER.exception(exc) + return + LOGGER.debug(data) + + def get_row(type_, instances, is_inactive, key="Size"): + inactive = [i for i in instances if is_inactive(i)] + item_data = { + "type": type_, + "size": sum(i.get(key, 0) for i in instances), + "reclaimable": sum(i.get(key, 0) for i in inactive), + "count": len(instances), + "active": len(instances) - len(inactive), + } + return json.dumps(item_data) + + # images: + images = data.get("Images") or [] + row = get_row("images", images, lambda i: i["Containers"] == 0) + section.append(row) + + # containers: + containers = data.get("Containers") or [] + row = get_row("containers", containers, lambda c: c["State"] != "running", key="SizeRw") + section.append(row) + + # volumes + volumes = [v.get("UsageData", {}) for v in data.get("Volumes") or []] + if not any(-1 in v.values() for v in volumes): + row = get_row("volumes", volumes, lambda v: v.get("RefCount", 0) == 0) + section.append(row) + + # build_cache: + build_cache = data.get("BuildCache") or [] + row = get_row("buildcache", build_cache, lambda b: b.get("InUse")) + section.append(row) + + section.write() + + +def _robust_inspect(client, docker_object): + object_map = { + "images": { + "api": client.api.images, + "getter": client.images.get, + "kwargs": {}, + }, + "containers": { + "api": client.api.containers, + "getter": client.containers.get, + "kwargs": {"all": True}, + }, + } + if docker_object not in object_map: + raise RuntimeError("Unkown docker object: %s" % docker_object) + + api = object_map[docker_object]["api"] + getter = object_map[docker_object]["getter"] + kwargs = object_map[docker_object]["kwargs"] + # workaround instead of calling client.OBJECT.list() directly to be able to + # ignore errors when OBJECT was removed in between listing available OBJECT + # and getting detailed information about them + for response in api(**kwargs): + try: + yield getter(response["Id"]) + except docker.errors.NotFound: + pass + + +@time_it +def section_node_images(client): + """in subsections list [[[images]]] and [[[containers]]]""" + section = Section("node_images") + + images = _robust_inspect(client, "images") + LOGGER.debug(images) + section.append("[[[images]]]") + for image in images: + section.append(json.dumps(image.attrs)) + + LOGGER.debug(client.all_containers) + section.append("[[[containers]]]") + for container in client.all_containers.values(): + section.append(json.dumps(container.attrs)) + + section.write() + + +@time_it +def section_node_network(client): + networks = client.networks.list(filters={"driver": "bridge"}) + section = Section("node_network") + section += [json.dumps(n.attrs) for n in networks] + section.write() + + +def section_container_node_name(client, container_id): + node_name = client.node_info.get("Name") + section = Section("container_node_name", piggytarget=container_id) + section.append(json.dumps({"NodeName": node_name})) + section.write() + + +def section_container_status(client, container_id): + container = client.all_containers[container_id] + status = container.attrs.get("State", {}) + + healthcheck = container.attrs.get("Config", {}).get("Healthcheck") + if healthcheck: + status["Healthcheck"] = healthcheck + restart_policy = container.attrs.get("HostConfig", {}).get("RestartPolicy") + if restart_policy: + status["RestartPolicy"] = restart_policy + + try: + status["ImageTags"] = container.image.tags + except docker.errors.ImageNotFound: + # image has been removed while container is still running + pass + status["NodeName"] = client.node_info.get("Name") + + section = Section("container_status", piggytarget=container_id) + section.append(json.dumps(status)) + section.write() + + +def section_container_labels(client, container_id): + container = client.all_containers[container_id] + section = Section("container_labels", piggytarget=container_id) + section.append(json.dumps(container.labels)) + section.write() + + +def section_container_network(client, container_id): + container = client.all_containers[container_id] + network = container.attrs.get("NetworkSettings", {}) + section = Section("container_network", piggytarget=container_id) + section.append(json.dumps(network)) + section.write() + + +def _is_not_running_exception(exception): + return ( + exception.response.status_code + in ( + 409, + 500, # Thrown by old docker versions: SUP-10974 + ) + and "is not running" in exception.explanation + ) + + +def section_container_agent(client, container_id): + container = client.all_containers[container_id] + if container.status != "running": + return True + try: + result = client.run_agent(container) + except docker.errors.APIError as e: + # container was removed in between collecting containers and here + if _is_not_running_exception(e): + return True + raise e + success = "<<<check_mk>>>" in result + if success: + LOGGER.debug("running check_mk_agent in container %s: ok", container_id) + section = Section(piggytarget=container_id) + section.append(result) + section.write() + else: + LOGGER.warning("running check_mk_agent in container %s failed: %s", container_id, result) + return success + + +def section_container_mem(client, container_id): + stats = client.get_container_stats(container_id) + if stats is None: # container not running + return + container_mem = stats["memory_stats"] + section = Section("container_mem", piggytarget=container_id) + section.append(json.dumps(container_mem)) + section.write() + + +def section_container_cpu(client, container_id): + stats = client.get_container_stats(container_id) + if stats is None: # container not running + return + container_cpu = stats["cpu_stats"] + section = Section("container_cpu", piggytarget=container_id) + section.append(json.dumps(container_cpu)) + section.write() + + +def section_container_diskstat(client, container_id): + stats = client.get_container_stats(container_id) + if stats is None: # container not running + return + container_blkio = stats["blkio_stats"] + container_blkio["time"] = time.time() + container_blkio["names"] = client.device_map() + section = Section("container_diskstat", piggytarget=container_id) + section.append(json.dumps(container_blkio)) + section.write() + + +NODE_SECTIONS = ( + ("docker_node_info", section_node_info), + ("docker_node_disk_usage", section_node_disk_usage), + ("docker_node_images", section_node_images), + ("docker_node_network", section_node_network), +) + +CONTAINER_API_SECTIONS = ( + ("docker_container_node_name", section_container_node_name), + ("docker_container_status", section_container_status), + ("docker_container_labels", section_container_labels), + ("docker_container_network", section_container_network), +) + +CONTAINER_API_SECTIONS_NO_AGENT = ( + ("docker_container_mem", section_container_mem), + ("docker_container_cpu", section_container_cpu), + ("docker_container_diskstat", section_container_diskstat), +) + + +def call_node_sections(client, config): + for name, section in NODE_SECTIONS: + if is_disabled_section(config, name): + continue + try: + section(client) + except Exception as exc: + if DEBUG: + raise + report_exception_to_server(exc, section.__name__) + + +def call_container_sections(client, config): + jobs = [] + for container_id in client.all_containers: + job = multiprocessing.Process( + target=_call_single_containers_sections, args=(client, config, container_id) + ) + job.start() + jobs.append(job) + + for job in jobs: + job.join() + + +def _call_single_containers_sections(client, config, container_id): + LOGGER.info("container id: %s", container_id) + for name, section in CONTAINER_API_SECTIONS: + if is_disabled_section(config, name): + continue + try: + section(client, container_id) + except Exception as exc: + if DEBUG: + raise + report_exception_to_server(exc, section.__name__) + + agent_success = False + if not is_disabled_section(config, "docker_container_agent"): + try: + agent_success = section_container_agent(client, container_id) + except Exception as exc: + if DEBUG: + raise + report_exception_to_server(exc, "section_container_agent") + if agent_success: + return + + for name, section in CONTAINER_API_SECTIONS_NO_AGENT: + if is_disabled_section(config, name): + continue + try: + section(client, container_id) + except Exception as exc: + if DEBUG: + raise + report_exception_to_server(exc, section.__name__) + + +# . +# .--Main----------------------------------------------------------------. +# | __ __ _ | +# | | \/ | __ _(_)_ __ | +# | | |\/| |/ _` | | '_ \ | +# | | | | | (_| | | | | | | +# | |_| |_|\__,_|_|_| |_| | +# | | +# +----------------------------------------------------------------------+ +# | | +# '----------------------------------------------------------------------' + + +def main(): + + args = parse_arguments() + config = get_config(args.config_file) + + try: # first calls by docker-daemon: report failure + client = MKDockerClient(config) + except Exception as exc: + if DEBUG: + raise + report_exception_to_server(exc, "MKDockerClient.__init__") + sys.exit(0) + + set_version_info(client) + + call_node_sections(client, config) + + call_container_sections(client, config) + + +if __name__ == "__main__": + main() diff --git a/tasks/main.yml b/tasks/main.yml index e1fcaeebb1b3ffc5a2195e946115807ad13a6de9..233287c72bda92c9eaca31175849147ff429d2df 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -37,3 +37,15 @@ dest: "/etc/docker/daemon.json" mode: "0644" notify: Restart Docker + +- name: copy CheckMK-Plugin and config + ansible.builtin.copy: + src: "{{ item.src }}" + dest: "{{ item.dest }}" + mode: "{{ item.mode }}" + with_items: + - {src: "/mk_docker.py", dest: "/opt/mk_docker.py", mode: "0755"} + - {src: "/docker.cfg", dest: "/etc/check_mk/docker.cfg", mode: "0644"} + +- name: Install CheckMK-Plugin + ansible.builtin.shell: install opt/mk_docker.py /usr/lib/check_mk_agent/plugins \ No newline at end of file