monitor

Monitoring script
git clone git://git.bain.cz/monitor.git
Log | Files | Refs | README

commit 2a924f8266b847dcfa3cd7325cb46aec59674afe
parent 3f4faf42e3b4a916b60a4aa39561ad64e4b0022c
Author: bain <bain@bain.cz>
Date:   Sat, 20 Aug 2022 16:28:35 +0200

add better logs; one push per run

Diffstat:
M monitor.py | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M requirements.txt | 3 +--
2 files changed, 121 insertions(+), 52 deletions(-)

diff --git a/monitor.py b/monitor.py @@ -1,68 +1,124 @@ #!/bin/python3 # HTTP, DNS, and IP monitoring script +from collections import namedtuple import time import logging import datetime import socket import json import os -from typing import Callable +from typing import Callable, List import requests import pydig import git import pytz -logger = logging.getLogger(__name__) -logger.addHandler(logging.StreamHandler()) -logger.setLevel(logging.DEBUG) +REPO_ROOT = os.getenv("STATUS_REPO", "status-repo") + + +class Formatter(logging.Formatter): + COLOR_RST = "\033[0m" + COLORS = { + "reset": "\033[0m", + "cyan": "\033[36m", + "red": "\033[31m", + "boldred": "\033[1;31m", + "green": "\033[32m", + "blue": "\033[34m", + "yellow": "\033[33m", + } + LOGGING_COLORS = { + logging.DEBUG: "blue", + logging.INFO: "green", + logging.WARNING: "yellow", + logging.WARN: "yellow", + logging.ERROR: "red", + logging.CRITICAL: "boldred", + } + + def __init__(self, exclude_time_for: int = 1, disable_colors: bool = False) -> None: + """ + Fancy formatter + + Args: + exclude_time_for (int): number of seconds that must have passed + for another timestamp to be shown + max_width (int): max log width, defaults to 80 characters + """ + super().__init__() + self.last_timestamp = 0 + self.exclude_time_for = exclude_time_for + self.disable_colors = disable_colors + + def c(self, color: str) -> str: + if self.disable_colors is True: + return "" + else: + return self.COLORS[color] + + def format(self, record: logging.LogRecord) -> str: + output = "" + if self.last_timestamp + self.exclude_time_for < record.created: + dt = datetime.datetime.fromtimestamp(record.created) + output += ( + self.c("cyan") + dt.strftime("[%d/%m %H:%M:%S]") + self.COLOR_RST + " " + ) + self.last_timestamp = record.created + else: + output += " " * 17 + output += self.c(self.LOGGING_COLORS.get(record.levelno, "reset")) + output += f"{record.levelname.upper()[:3]}{self.COLOR_RST} " + output += record.msg % record.args + 
return output -REPO_ROOT = "status-repo" + +logger = logging.getLogger(__name__) # last states of services to keep from detecting downtime repeatedly last_states: dict[str, bool] = {} RequirementCheck = Callable[..., bool] MonitorDict = dict[str, dict[RequirementCheck, dict]] +Fail = namedtuple("Fail", ("service_name", "failed_requirements")) # publish a failed service, no dependents so edit at will -def fail(service_name: str, failed_requirements: list): - if not last_states.get(service_name, True): - return - now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague")) - filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md" - repo = git.Repo(REPO_ROOT) - origin = repo.remote('origin') +def fail(failed: List[Fail]): + repo = git.Repo(REPO_ROOT) # type: ignore + origin = repo.remote("origin") try: origin.pull(kill_after_timeout=10) - except git.exc.CommandError: - logger.warning("Failed to pull from origin! Aborting!") + except git.CommandError: + logger.error("failed to pull from origin") return - # noinspection PyShadowingNames - with open(REPO_ROOT + "/" + filename, 'w+') as f: - lines = [ - "---\n", - f"title: {service_name} downtime\n", - f"date: {now.strftime('%Y-%m-%d %H:%M:%S %z')}\n", - "severity: down\n", - "affected:\n", - f" - {service_name}\n", - "---\n", - f"Automatic checks for {service_name} have failed. 
" - f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n" - ] - f.writelines(lines) - repo.git.add(filename) - repo.git.commit('-m', f'{service_name} downtime') + for service_name, failed_requirements in failed: + if not last_states.get(service_name, True): + continue # we've already seen the service down + now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague")) + filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md" + with open(REPO_ROOT + "/" + filename, "w+") as f: + lines = [ + "---\n", + f"title: {service_name} downtime\n", + f"date: {now.strftime('%Y-%m-%d %H:%M:%S %z')}\n", + "severity: down\n", + "affected:\n", + f" - {service_name}\n", + "---\n", + f"Automatic checks for {service_name} have failed. " + f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n", + ] + f.writelines(lines) + repo.git.add(filename) + repo.git.commit("-m", f"{service_name} downtime") try: origin.push(kill_after_timeout=10) - except git.exc.CommandError: - logger.warning("Push to origin failed! 
Aborting and resetting!") + except git.CommandError: + logger.error("failed to push to origin, resetting working tree") repo.git.reset("origin/HEAD", working_tree=True) - - logger.warning(f"service {service_name} failed {[r.__name__ for r in failed_requirements]}") + logger.info("failed services published") def self_check() -> bool: @@ -80,14 +136,15 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen def inner_retry(func: RequirementCheck) -> RequirementCheck: def inner(*args, **kwargs) -> bool: passed = False - for i in range(n - 1): + for _ in range(n - 1): passed = func(*args, **kwargs) if passed: break time.sleep(sleep) return passed - inner.__name__ = func.__name__ # preserve names in log (instead of each requirement being called "inner") + # preserve names in log (instead of each requirement being called "inner") + inner.__name__ = func.__name__ return inner return inner_retry @@ -96,7 +153,7 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen @retry() def http_requirement(url: str, code: int) -> bool: try: - resp = requests.head(url) + resp = requests.head(url, headers={"User-agent": "monitoring (v1)"}) except requests.exceptions.ConnectionError: return False else: @@ -108,7 +165,7 @@ def dns_requirement(name: str, ip: str) -> bool: query = pydig.query(name, "A") except ConnectionError: return False - return query and (ip == "*" or ip in query) + return query is not None and (ip == "*" or ip in query) @retry() @@ -124,22 +181,33 @@ def ip_requirement(ip: str, port: int, prot: str) -> bool: def check(monitors: MonitorDict): + failed_services: List[Fail] = [] for service, requirements in monitors.items(): - logger.debug(f"Checking service {service}") + logger.info(f"checking service {service}") failed = [] for requirement, args in requirements.items(): - logger.debug(f" checking requirement {requirement.__name__}") passed = requirement(**args) if not passed: if not self_check(): - 
logger.warning("Self-check failed, assuming bad connection and aborting") + logger.error( + "self-check failed, assuming bad connection and aborting" + ) return - logger.info(f"{service} failed requirement {requirement.__name__}") + logger.warning(f" {requirement.__name__}({args})") failed.append(requirement) time.sleep(1) if failed: - fail(service, failed) - last_states[service] = len(failed) == 0 + failed_services.append(Fail(service, failed)) + + if failed_services: + fail(failed_services) + + # update last_states + for service in monitors.keys(): + last_states[service] = True + for fs in failed_services: + last_states[fs.service_name] = False + logger.debug("check complete") @@ -155,19 +223,21 @@ monitors_: MonitorDict = { "git.bain.cz": { http_requirement: {"url": "https://git.bain.cz/", "code": 200}, }, - "ts3.bain.cz": { - ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"} - } + "ts3.bain.cz": {ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}}, } -if __name__ == '__main__': +if __name__ == "__main__": + handler = logging.StreamHandler() + handler.setFormatter(Formatter()) + logging.basicConfig(level=logging.INFO, handlers=[handler]) + # we assume this is going to be run in a cron job as the gitpython # library is slowly leaking memory apparently if os.path.exists("last-state"): - with open("last-state", 'r') as f: + with open("last-state", "r") as f: last_states = json.load(f) check(monitors_) - with open("last-state", 'w+') as f: + with open("last-state", "w+") as f: json.dump(last_states, f) diff --git a/requirements.txt b/requirements.txt @@ -1,4 +1,4 @@ requests pydig gitpython -pytz -\ No newline at end of file +pytz