monitor

Monitoring script
git clone git://git.bain.cz/monitor.git
Log | Files | Refs | README

monitor.py (7701B)


      1 #!/bin/python3
      2 # HTTP, DNS, and IP monitoring script
      3 from collections import namedtuple
      4 import time
      5 import logging
      6 import datetime
      7 import socket
      8 import json
      9 import os
     10 from typing import Callable, List
     11 
     12 import requests
     13 import pydig
     14 import git
     15 import pytz
     16 
     17 REPO_ROOT = os.getenv("STATUS_REPO", "status-repo")
     18 
     19 
     20 class Formatter(logging.Formatter):
     21     COLOR_RST = "\033[0m"
     22     COLORS = {
     23         "reset": "\033[0m",
     24         "cyan": "\033[36m",
     25         "red": "\033[31m",
     26         "boldred": "\033[1;31m",
     27         "green": "\033[32m",
     28         "blue": "\033[34m",
     29         "yellow": "\033[33m",
     30     }
     31     LOGGING_COLORS = {
     32         logging.DEBUG: "blue",
     33         logging.INFO: "green",
     34         logging.WARNING: "yellow",
     35         logging.WARN: "yellow",
     36         logging.ERROR: "red",
     37         logging.CRITICAL: "boldred",
     38     }
     39 
     40     def __init__(self, exclude_time_for: int = 1, disable_colors: bool = False) -> None:
     41         """
     42         Fancy formatter
     43 
     44         Args:
     45             exclude_time_for (int): number of seconds that must have passed
     46                 for another timestamp to be shown
     47                 max_width (int): max log width, defaults to 80 characters
     48         """
     49         super().__init__()
     50         self.last_timestamp = 0
     51         self.exclude_time_for = exclude_time_for
     52         self.disable_colors = disable_colors
     53 
     54     def c(self, color: str) -> str:
     55         if self.disable_colors is True:
     56             return ""
     57         else:
     58             return self.COLORS[color]
     59 
     60     def format(self, record: logging.LogRecord) -> str:
     61         output = ""
     62         if self.last_timestamp + self.exclude_time_for < record.created:
     63             dt = datetime.datetime.fromtimestamp(record.created)
     64             output += (
     65                 self.c("cyan") + dt.strftime("[%d/%m %H:%M:%S]") + self.COLOR_RST + " "
     66             )
     67             self.last_timestamp = record.created
     68         else:
     69             output += " " * 17
     70         output += self.c(self.LOGGING_COLORS.get(record.levelno, "reset"))
     71         output += f"{record.levelname.upper()[:3]}{self.COLOR_RST} "
     72         output += record.msg % record.args
     73         return output
     74 
     75 
     76 logger = logging.getLogger(__name__)
     77 
     78 # last states of services to keep from detecting downtime repeatedly
     79 last_states: dict[str, bool] = {}
     80 
     81 RequirementCheck = Callable[..., bool]
     82 MonitorDict = dict[str, dict[RequirementCheck, dict]]
     83 Fail = namedtuple("Fail", ("service_name", "failed_requirements"))
     84 
     85 
     86 # publish a failed service, no dependents so edit at will
     87 def fail(failed: List[Fail]):
     88     repo = git.Repo(REPO_ROOT)  # type: ignore
     89     origin = repo.remote("origin")
     90     try:
     91         origin.pull(kill_after_timeout=10)
     92     except git.CommandError:
     93         logger.error("failed to pull from origin")
     94         return
     95 
     96     for service_name, failed_requirements in failed:
     97         if not last_states.get(service_name, True):
     98             continue  # we've already seen the service down
     99         now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
    100         filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"
    101         with open(REPO_ROOT + "/" + filename, "w+") as f:
    102             lines = [
    103                 "---\n",
    104                 f"title: {service_name} downtime\n",
    105                 f"date: {now.strftime('%Y-%m-%d %H:%M:%S %z')}\n",
    106                 "severity: down\n",
    107                 "affected:\n",
    108                 f" - {service_name}\n",
    109                 "---\n",
    110                 f"Automatic checks for {service_name} have failed. "
    111                 f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n",
    112             ]
    113             f.writelines(lines)
    114         repo.git.add(filename)
    115         repo.git.commit("-m", f"{service_name} downtime")
    116     try:
    117         origin.push(kill_after_timeout=10)
    118     except git.CommandError:
    119         logger.error("failed to push to origin, resetting working tree")
    120         repo.git.reset("origin/HEAD", working_tree=True)
    121     logger.info("failed services published")
    122 
    123 
    124 def self_check() -> bool:
    125     try:
    126         if requests.get("https://google.com/").status_code != 200:
    127             return False
    128     except requests.exceptions.ConnectionError:
    129         return False
    130     return True
    131 
    132 
    133 def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], RequirementCheck]:
    134     """Decorator maker for calling a function multiple times with sleep time between calls."""
    135 
    136     def inner_retry(func: RequirementCheck) -> RequirementCheck:
    137         def inner(*args, **kwargs) -> bool:
    138             passed = False
    139             for _ in range(n - 1):
    140                 passed = func(*args, **kwargs)
    141                 if passed:
    142                     break
    143                 time.sleep(sleep)
    144             return passed
    145 
    146         # preserve names in log (instead of each requirement being called "inner")
    147         inner.__name__ = func.__name__
    148         return inner
    149 
    150     return inner_retry
    151 
    152 
    153 @retry()
    154 def http_requirement(url: str, code: int) -> bool:
    155     try:
    156         resp = requests.head(url, headers={"User-agent": "monitoring (v1)"})
    157     except requests.exceptions.ConnectionError:
    158         return False
    159     else:
    160         return resp.status_code == code
    161 
    162 
    163 def dns_requirement(name: str, ip: str) -> bool:
    164     try:
    165         query = pydig.query(name, "A")
    166     except ConnectionError:
    167         return False
    168     return query is not None and (ip == "*" or ip in query)
    169 
    170 
    171 @retry()
    172 def ip_requirement(ip: str, port: int, prot: str) -> bool:
    173     protocol = socket.SOCK_STREAM if prot == "tcp" else socket.SOCK_DGRAM
    174     sock = socket.socket(type=protocol)
    175     try:
    176         sock.connect((ip, port))
    177     except ConnectionError:
    178         return False
    179     sock.close()
    180     return True
    181 
    182 
    183 def check(monitors: MonitorDict):
    184     failed_services: List[Fail] = []
    185     for service, requirements in monitors.items():
    186         logger.info(f"checking service {service}")
    187         failed = []
    188         for requirement, args in requirements.items():
    189             passed = requirement(**args)
    190             if not passed:
    191                 if not self_check():
    192                     logger.error(
    193                         "self-check failed, assuming bad connection and aborting"
    194                     )
    195                     return
    196                 logger.warning(f"  {requirement.__name__}({args})")
    197                 failed.append(requirement)
    198             time.sleep(1)
    199         if failed:
    200             failed_services.append(Fail(service, failed))
    201 
    202     if failed_services:
    203         fail(failed_services)
    204 
    205     # update last_states
    206     for service in monitors.keys():
    207         last_states[service] = True
    208     for fs in failed_services:
    209         last_states[fs.service_name] = False
    210 
    211     logger.debug("check complete")
    212 
    213 
    214 monitors_: MonitorDict = {
    215     "f.bain.cz": {
    216         http_requirement: {"url": "https://f.bain.cz/status", "code": 200},
    217         # dns_requirement: {"name": "f.bain.cz", "ip": "*"},
    218         # ip_requirement: {"ip": "f.bain.cz", "port": 80, "prot": "tcp"}
    219     },
    220     "s.bain.cz": {
    221         http_requirement: {"url": "https://s.bain.cz/", "code": 200},
    222     },
    223     "git.bain.cz": {
    224         http_requirement: {"url": "https://git.bain.cz/", "code": 200},
    225     },
    226     "ts3.bain.cz": {ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}},
    227 }
    228 
    229 if __name__ == "__main__":
    230     handler = logging.StreamHandler()
    231     handler.setFormatter(Formatter())
    232     logging.basicConfig(level=logging.INFO, handlers=[handler])
    233 
    234     # we assume this is going to be run in a cron job as the gitpython
    235     # library is slowly leaking memory apparently
    236     if os.path.exists("last-state"):
    237         with open("last-state", "r") as f:
    238             last_states = json.load(f)
    239 
    240     check(monitors_)
    241 
    242     with open("last-state", "w+") as f:
    243         json.dump(last_states, f)