From 968c171dae7767d1c1492633d263c9123f29b448 Mon Sep 17 00:00:00 2001
From: Damien De Paoli
Date: Sat, 16 Aug 2025 10:55:39 +1000
Subject: [PATCH] scripts that log content with telegraf so we can show it in
 grafana. These run on the host as they need access to logs/content that is
 underneath the containers

---
 telegraf-and-kuma-backups      |  31 ++++
 telegraf-last-docker-update.py | 290 +++++++++++++++++++++++++++++++++
 telegraf-watchtower-enabled    |  18 ++
 3 files changed, 339 insertions(+)
 create mode 100755 telegraf-and-kuma-backups
 create mode 100755 telegraf-last-docker-update.py
 create mode 100755 telegraf-watchtower-enabled

diff --git a/telegraf-and-kuma-backups b/telegraf-and-kuma-backups
new file mode 100755
index 0000000..6e99941
--- /dev/null
+++ b/telegraf-and-kuma-backups
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+# Record the result of the last backup for telegraf (prometheus text format)
+# and push the matching up/down status to kuma.
+
+prom_file=/srv/docker/container/telegraf/monitoring-results/backups.prom
+ISSUE=$(/home/ddp/bin/backup-success)
+RES=$?
+
+# escape backslash, double quote and newline so the text stays a valid label value
+ISSUE=${ISSUE//\\/\\\\}
+ISSUE=${ISSUE//\"/\\\"}
+ISSUE=${ISSUE//$'\n'/\\n}
+
+{
+    echo "# HELP node_backups status of last backup"
+    echo "# TYPE node_backups gauge"
+    if [ "$RES" -ne 0 ]; then
+        echo "node_backups{what=\"state\",issue=\"$ISSUE\"} $RES"
+    else
+        # need a space here to make the metric still have an issue field & so grafanas regex matches it
+        echo "node_backups{what=\"state\",issue=\" \"} $RES"
+    fi
+} > "$prom_file"
+
+# tell kuma as well; the message matches the status being pushed
+if [ "$RES" -ne 0 ]; then
+    curl 'http://mara.ddp.net:3001/api/push/hoJD234qsx?status=down&msg=FAILED'
+else
+    curl 'http://mara.ddp.net:3001/api/push/hoJD234qsx?status=up&msg=OK'
+fi
diff --git a/telegraf-last-docker-update.py b/telegraf-last-docker-update.py
new file mode 100755
index 0000000..44afdf1
--- /dev/null
+++ b/telegraf-last-docker-update.py
@@ -0,0 +1,290 @@
+#!/usr/bin/python3
+"""Export per-container update details as Prometheus text for telegraf.
+
+For every running service in docker-compose.yml this writes one
+node_docker_updates sample whose labels carry the image, the age of that
+image, whether watchtower updates it, whether its name appears in the kuma
+monitor dump and whether it looks internet-facing.  It runs on the docker
+host because it needs sudo docker and files underneath the containers.
+"""
+
+import yaml
+import subprocess
+import re
+import os
+import sys
+from datetime import datetime, timedelta, timezone
+import requests
+
+from zoneinfo import ZoneInfo
+
+COMPOSE_DIR = '/srv/docker/config'
+COMPOSE_FILE = COMPOSE_DIR + '/docker-compose.yml'
+UPDATE_LOG = '/srv/docker/log/update-docker.log'
+KUMA_FILE = '/srv/docker/container/mon/monitoring-results/kuma.txt'
+PROM_FILE = '/srv/docker/container/telegraf/monitoring-results/docker_updates.prom'
+
+# these ports are nat'd on the modem to mara, so if a container is in host mode
+# on one of these ports, or maps these ports, then that container is internet-facing
+PORT_FWDS=[ '7777', '7778', '27015', '993', '25', '465', '587' ]
+
+# timestamps in the "docker image history --human=false" output, matched with
+# and without their trailing +HH:MM offset
+TS_WITH_TZ = r'(\d{1,4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2}:\d{1,2}\+\d{1,2}:\d{1,2})'
+TS_NO_TZ = r'(\d{1,4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2}:\d{1,2})'
+
+# get list of running docker containers (this is done to also find true image id)
+docker_ps=subprocess.run( ['sudo','docker','ps'], stdout=subprocess.PIPE)
+# decode once so the helpers compare real text rather than the repr of bytes
+docker_ps_lines=docker_ps.stdout.decode('utf-8', 'replace').splitlines()
+
+
+def _ps_fields( c ):
+    """Return the split 'docker ps' row whose last column is c, else None."""
+    for line in docker_ps_lines:
+        fields = line.split()
+        if len(fields) >= 2 and fields[-1] == c:
+            return fields
+    return None
+
+
+# This finds the actual img id for the container, avoids quirks I had around postgres:16 not matching for some reason
+def ImageIdFor( c ):
+    """Return the second 'docker ps' column for c, or c itself if not listed."""
+    fields = _ps_fields( c )
+    return fields[1] if fields else c
+
+
+# return True if a container is running
+def IsRunning( c ):
+    """Return True when a 'docker ps' row ends with the name c."""
+    return _ps_fields( c ) is not None
+
+
+# Just exclude MAIA containers for now, think about this much harder another day
+def Excluded( c ):
+    """Return True when c is deliberately left out of the metrics."""
+    return 'MAIA_' in c
+
+
+def _labels( container ):
+    """Return the service's labels as "key=value" strings ([] if it has none)."""
+    labels = compose['services'][container].get('labels') or []
+    if isinstance(labels, dict):
+        # mapping syntax: rebuild the strings the list syntax would have given
+        return [f'{key}={value}' for key, value in labels.items()]
+    return labels
+
+
+# see if this container either has a matching port to the list of open ports on the modem (PORT_FWDS)
+# or if traefik is front-ending this, and the rule includes a Host that has depaoli.id.au on it
+# if the above are true, its internet-facing, and a higher risk, so worth knowing
+def external(container):
+    """Return True, False or "Maybe" for whether container is internet-facing.
+
+    "Maybe" is the answer for a labelled service that sets network_mode but
+    matched neither a forwarded port nor an external traefik Host rule.
+    """
+    if container == 'traefik':
+        return True
+
+    service = compose['services'][container]
+    for mapping in service.get('ports') or []:
+        # compare whole numbers, so forwarded port 25 does not match e.g. 8025
+        if set(re.findall( r'\d+', str(mapping) )) & set(PORT_FWDS):
+            return True
+
+    if 'labels' not in service:
+        return False
+
+    traefik_on=False
+    ext_host=False
+    for s in _labels( container ):
+        if 'traefik.enable=true' in s:
+            traefik_on=True
+        m=re.search( r"traefik.\S+.routers.\S+.rule=\s*Host\S*\(\`(.*)\`\)", s )
+        if m and 'depaoli.id.au' in m[1]:
+            ext_host=True
+    if traefik_on and ext_host:
+        return True
+
+    if 'network_mode' in service:
+        return "Maybe"
+
+    return False
+
+
+def watchtower(container):
+    """Return True when one label mentions both 'watchtower' and 'true'."""
+    return any( 'watchtower' in s and 'true' in s for s in _labels( container ) )
+
+
+# take a dt_str like 2020-12-10T09:21:12+11:00 and convert into a datetime,
+# keeping the wall-clock time as written and dropping the +TZ, so it can be
+# subtracted from the naive datetime.now() taken further down
+def norm_time(dt_str):
+    """Parse a timestamp with UTC offset and return it as a naive datetime."""
+    d=datetime.strptime(dt_str, '%Y-%m-%dT%H:%M:%S%z')
+    return d.replace(tzinfo=None)
+
+
+def norm_time_git(dt_str):
+    """Like norm_time, for text shaped like 2020-12-10 09:21:12 +1100."""
+    d=datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S %z')
+    return d.replace(tzinfo=None)
+
+
+# checks to see if the named container shows up in the kuma monitor dump
+def InKuma( container ):
+    """Return True when container's name occurs anywhere in KUMA_FILE."""
+    return container in monitors
+
+
+def GetLastCommmitDaysAgo( container ):
+    """Return whole days since the first commit listed at last.commit.url.
+
+    The URL comes from the service's last.commit.url label.  -1 is returned
+    when there is no such label or the lookup fails, so that one unreachable
+    repo does not abort the whole run.
+    """
+    for label in _labels( container ):
+        m=re.search( '^last.commit.url=(.+)', label )
+        if not m:
+            continue
+        try:
+            # Fetch the latest commit details from GitHub API
+            response = requests.get( m[1], timeout=30 )
+            response.raise_for_status() # Raise an error for bad status codes
+            commit_date_str = response.json()[0]["commit"]["author"]["date"]
+            # Convert commit date to datetime object
+            commit_date = datetime.fromisoformat( commit_date_str.replace('Z', '+00:00') )
+            # Calculate the number of days since the commit was made
+            days_ago = ( datetime.now(timezone.utc) - commit_date ).days
+        except (requests.RequestException, ValueError, LookupError, TypeError) as err:
+            print( f'{container}: last commit lookup failed: {err}', file=sys.stderr )
+            return -1
+        return days_ago
+    return -1
+
+
+def _image_created( image ):
+    """Return the first usable timestamp of 'docker image history', or None.
+
+    None means the command failed or printed no usable timestamp.
+    """
+    res=subprocess.run(['sudo','docker','image','history','--human=false', image], stdout=subprocess.PIPE)
+    if res.returncode != 0:
+        return None
+    history = str(res.stdout, 'utf-8')
+    first = re.search( TS_WITH_TZ, history )
+    if first is None:
+        return None
+    if '0001-01-01' not in first[1]:
+        return norm_time( first[1] )
+    # the first timestamp is a zero date: take the next real one instead.  norm_time
+    # drops the offset again, so +10:00 is only there to satisfy its format string
+    for stamp in re.findall( TS_NO_TZ, history ):
+        if not stamp.startswith('0001-01-01'):
+            return norm_time( stamp + '+10:00' )
+    return None
+
+
+def _built_image( container ):
+    """Return column two of the last 'docker compose images' row, or None."""
+    res=subprocess.run(['sudo','docker','compose','images',container], stdout=subprocess.PIPE, cwd=COMPOSE_DIR)
+    if res.returncode != 0:
+        return None
+    rows = [row for row in str(res.stdout, 'utf-8').splitlines() if row.strip()]
+    m = re.search( r'(\S+)\s+(\S+)', rows[-1] ) if rows else None
+    return m[2] if m else None
+
+
+# Load the docker-compose.yaml file
+with open(COMPOSE_FILE) as f:
+    compose = yaml.safe_load(f)
+
+# Get the container names from the docker-compose.yaml file
+containers = list(compose['services'])
+
+# Load the update-docker.log file
+with open(UPDATE_LOG) as f:
+    logs = f.readlines()
+
+# Find the last log line for each container
+last_logs = {}
+for container in containers:
+    im = compose['services'][container].get('image') or ''
+    # a service without an image matches nothing and keeps None
+    matching = [log for log in logs if im != "" and im in log]
+    last_logs[container] = matching[-1] if matching else None
+
+# Get docker containers being monitored in kuma (read once, then closed)
+with open(KUMA_FILE) as f:
+    monitors = f.read()
+
+current_datetime = datetime.now()
+
+# the samples are collected first and written in one go at the very end, so a
+# slow or failing run does not leave a truncated .prom file behind
+prom_lines = [
+    '# HELP node_docker_updates details of last known update of a container, whether it has a locked version (or latest tag) and whether watchtower is updating it',
+    '# TYPE node_docker_updates gauge',
+]
+
+for container in last_logs:
+    # skip containers that are not running or are deliberately excluded
+    if not IsRunning( container ) or Excluded( container ):
+        continue
+
+    im = compose['services'][container].get('image') or ''
+    is_built = im == ''
+    image_label = 'Built' if is_built else im
+
+    # a tag counts as locked unless it is absent or one of the rolling ones below
+    locked = ':' in im and ':latest' not in im and ':nightly' not in im and ':beta' not in im and im != 'php:apache'
+    latest_tag = 'no' if locked else 'yes'
+
+    if watchtower( container ):
+        watched = 'yes'
+    elif is_built:
+        watched = 'Built'
+    else:
+        watched = 'no'
+
+    if is_built:
+        built_image = _built_image( container )
+        last_update = _image_created( built_image ) if built_image else None
+    else:
+        last_update = _image_created( ImageIdFor( container ) )
+
+#    if is_built and 'book' in container:
+#        res=subprocess.run(['git', '-C', '/home/ddp/src/pybook/', 'log', '--date=iso', '-n', '1', '--pretty=%ci'], stdout=subprocess.PIPE)
+#        last_update=norm_time_git( str(res.stdout, 'utf-8').strip() )
+#
+#    if is_built and 'pa' in container:
+#        res=subprocess.run(['git', '-C', '/home/ddp/src/photoassistant/', 'log', '--date=iso', '-n', '1', '--pretty=%ci'], stdout=subprocess.PIPE)
+#        last_update=norm_time_git( str(res.stdout, 'utf-8').strip() )
+
+    # 99999 stands for "unknown" in both age labels
+    age_in_days = 99999 if last_update is None else ( current_datetime - last_update ).days
+    shown_update = 'No Date' if last_update is None else last_update
+    days_ago = GetLastCommmitDaysAgo( container )
+    if days_ago < 0:
+        days_ago = 99999
+
+    parts = [
+        'container="' + container + '"',
+        f'latest_tag="{latest_tag}"',
+        f'image="{image_label}"',
+        f'watchtower="{watched}"',
+        f'monitored="{InKuma(container)}"',
+        f'age_in_days="{age_in_days}"',
+        f'repo_commit_in_days="{days_ago}"',
+        f'last_update="{shown_update}"',
+        f'internet_facing="{external(container)}"',
+    ]
+    prom_lines.append( 'node_docker_updates{' + ', '.join(parts) + '} 1' )
+
+# open file for writing prometheus formatted data into
+with open(PROM_FILE, 'w') as f:
+    f.write( '\n'.join(prom_lines) + '\n' )
diff --git a/telegraf-watchtower-enabled b/telegraf-watchtower-enabled
new file mode 100755
index 0000000..ea93a67
--- /dev/null
+++ b/telegraf-watchtower-enabled
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#
+# an update, BUT, as prometheus is just looking at the last state of the file,
+# it would see the last values and think they are 'current'.
+#
+# SO, lets date 'stamp' when we ran this, rather than give a boolean, because if
+# its running this will update the date, if not, it wont anyway :)
+
+log_file=/srv/docker/container/telegraf/monitoring-results/watchtower-enabled.influx
+# count the watchtower enable labels, skipping commented-out lines (indented ones
+# too); -F keeps the dots in the label name literal
+cnt=$(grep -Ev '^[[:space:]]*#' /srv/docker/config/docker-compose.yml | grep -cF com.centurylinklabs.watchtower.enable)
+
+tstamp=$(date +%s%N)
+# using influx format
+echo "mara_watchtower_enabled count=$cnt $tstamp" > "$log_file"
+