Scripts that log content with telegraf so we can show it in grafana. These run on the host because they need access to logs/content that sits underneath the containers.

This commit is contained in:
2025-08-16 10:55:39 +10:00
parent 1eb28f6975
commit 968c171dae
3 changed files with 300 additions and 0 deletions

16
telegraf-and-kuma-backups Executable file
View File

@@ -0,0 +1,16 @@
#!/bin/bash
# Publish the state of the last backup twice:
#   1. as a Prometheus text file that telegraf picks up (shown in grafana)
#   2. as an up/down push to the kuma push monitor
prom_file=/srv/docker/container/telegraf/monitoring-results/backups.prom
kuma_push='http://mara.ddp.net:3001/api/push/hoJD234qsx'

# stdout of backup-success becomes the issue text, its exit status the metric value
ISSUE=$(/home/ddp/bin/backup-success)
RES=$?

# A label value must not contain raw newlines, backslashes or double quotes,
# otherwise the whole .prom file becomes unparseable
issue_label=$(printf '%s' "$ISSUE" | tr '\n' ' ' | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')

{
    printf '%s\n' "# HELP node_backups status of last backup"
    printf '%s\n' "# TYPE node_backups gauge"
    if [ "$RES" -ne 0 ]; then
        # never emit an empty issue label, grafanas regex needs something to match
        printf '%s\n' "node_backups{what=\"state\",issue=\"${issue_label:-unknown}\"} $RES"
    else
        # need a space here to make the metric still have an issue field & so grafanas regex matches it
        printf '%s\n' "node_backups{what=\"state\",issue=\" \"} $RES"
    fi
} > "$prom_file"

if [ "$RES" -ne 0 ]; then
    # send the real problem as the message (url-encoded) instead of a misleading "OK"
    curl -G --data-urlencode "status=down" --data-urlencode "msg=${ISSUE:-backup failed}" "$kuma_push"
else
    curl "${kuma_push}?status=up&msg=OK"
fi

268
telegraf-last-docker-update.py Executable file
View File

@@ -0,0 +1,268 @@
#!/usr/bin/python3
import yaml
import subprocess
import re
import os
from datetime import datetime, timedelta, timezone
import requests
from zoneinfo import ZoneInfo
# These ports are NAT'd on the modem to mara, so a container that is in host
# mode on one of these ports, or maps one of these ports, is internet-facing.
PORT_FWDS = ['7777', '7778', '27015', '993', '25', '465', '587']

# Snapshot of the running docker containers (also used to find the true image id).
# stdout is captured as bytes, one container per line.
docker_ps = subprocess.run(['sudo', 'docker', 'ps'], stdout=subprocess.PIPE)
# This finds the actual img id for the container, avoids quirks I had around postgres:16 not matching for some reason
def ImageIdFor( c ):
    """Return the IMAGE column of the ``docker ps`` row whose last column is *c*.

    The captured ``docker_ps`` output is scanned line by line. A row belongs
    to the container when its final whitespace-separated field equals *c*
    exactly. The name is compared literally, so a name such as ``my.web``
    can no longer match ``myXweb`` the way an unescaped regex would.

    Returns *c* itself when no row matches, so callers can still hand the
    value to docker.
    """
    for raw in docker_ps.stdout.splitlines():
        # stdout is bytes unless the capture was made in text mode
        text = raw.decode('utf-8', 'replace') if isinstance(raw, bytes) else raw
        fields = text.split()
        # need at least: container id, image, name
        if len(fields) >= 3 and fields[-1] == c:
            return fields[1]
    return c
# return True if a container is running
def IsRunning( c ):
    """Return True when the captured ``docker ps`` output has a row named *c*.

    A row matches when its last whitespace-separated field equals *c*
    exactly. The comparison is literal, so regex metacharacters in a
    container name (``.`` is legal in names) cannot cause false matches.
    """
    for raw in docker_ps.stdout.splitlines():
        # stdout is bytes unless the capture was made in text mode
        text = raw.decode('utf-8', 'replace') if isinstance(raw, bytes) else raw
        fields = text.split()
        # a lone field cannot be a name column preceded by other columns
        if len(fields) >= 2 and fields[-1] == c:
            return True
    return False
# Just exclude MAIA containers for now, think about this much harder another day
def Excluded( c ):
    """Return True when the container name *c* contains ``MAIA_``, else False."""
    return 'MAIA_' in c
def _port_tokens(port_entry):
    """Break one compose ``ports`` entry into the pieces that can name a port."""
    if isinstance(port_entry, dict):
        # long syntax (target/published keys): only the port values matter
        return [str(port_entry[key]) for key in ('published', 'target') if key in port_entry]
    # short syntax "[ip:]host:container[/proto]"; str() also copes with bare
    # integers, which YAML produces for unquoted entries
    return re.split(r'[:/]', str(port_entry))


def _token_covers(token, port):
    """Return True if *token* is exactly *port* or a ``low-high`` range holding it."""
    if token == port:
        return True
    low, dash, high = token.partition('-')
    if dash and low.isdigit() and high.isdigit() and port.isdigit():
        return int(low) <= int(port) <= int(high)
    return False


# see if this container either has a matching port to the list of open ports on the modem (PORT_FWDS)
# or if traefik is front-ending this, and the rule includes a Host that has depaoli.id.au on it
# if the above are true, its internet-facing, and a higher risk, so worth knowing
def external(container):
    """Classify how exposed *container* (a service name in ``compose``) is.

    Returns:
        True    - the service is ``traefik`` itself, or one of its port
                  mappings names a port in PORT_FWDS, or it has traefik
                  enabled together with a router Host rule containing
                  ``depaoli.id.au``.
        "Maybe" - none of the above, but the service sets ``network_mode``,
                  so its real ports are not visible in the compose file.
        False   - otherwise.

    Ports are compared as whole tokens, not substrings, so a forwarded port
    ``25`` no longer matches a mapping such as ``2525:80``. The
    ``network_mode`` check also applies to services without labels.
    """
    if container == 'traefik':
        return True

    service = compose['services'][container]

    for entry in service.get('ports') or []:
        tokens = _port_tokens(entry)
        if any(_token_covers(tok, port) for tok in tokens for port in PORT_FWDS):
            return True

    traefik_on = False
    ext_host = False
    for s in service.get('labels') or []:
        if 'traefik.enable=true' in s:
            traefik_on = True
        m = re.search( r"traefik.\S+.routers.\S+.rule=\s*Host\S*\(\`(.*)\`\)", s )
        if m and 'depaoli.id.au' in m[1]:
            ext_host = True
    if traefik_on and ext_host:
        return True

    if 'network_mode' in service:
        return "Maybe"
    return False
def watchtower(container):
    """Return True if any label of *container* mentions both ``watchtower`` and ``true``.

    Services without a ``labels`` entry in the compose file yield False.
    """
    service = compose['services'][container]
    if 'labels' not in service:
        return False
    return any('watchtower' in label and 'true' in label for label in service['labels'])
# take a dt_str like 2020-12-10T09:21:12+11:00 and convert it into a naive datetime
# the +TZ offset is parsed and then simply dropped: the date and time fields are
# returned exactly as written (no hours are added or subtracted)
def norm_time(dt_str):
    """Parse ``YYYY-MM-DDTHH:MM:SS+HH:MM`` and return it without tzinfo."""
    return datetime.strptime(dt_str, '%Y-%m-%dT%H:%M:%S%z').replace(tzinfo=None)
def norm_time_git(dt_str):
    """Parse a git style ``YYYY-MM-DD HH:MM:SS +HHMM`` stamp and return it without tzinfo.

    The offset is parsed and then dropped; the date and time fields are kept as written.
    """
    return datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S %z').replace(tzinfo=None)
# Parse the docker-compose.yml file into a dict
with open('/srv/docker/config/docker-compose.yml') as compose_file:
    compose = yaml.safe_load(compose_file)
# The service names from the compose file; the rest of the script treats them as container names
containers = list(compose['services'])
# Read the update-docker.log file, one list entry per line
with open('/srv/docker/log/update-docker.log') as log_file:
    logs = log_file.readlines()
# Find the last log line for each container: the final line of update-docker.log
# that mentions the service's image (None when there is no image or no such line)
last_logs = {}
for container in containers:
    # the image is fixed per container, so look it up once rather than once per log line
    im = compose['services'][container].get('image') or ''
    if im:
        # walk the log backwards and stop at the first hit, i.e. the last mention
        last_logs[container] = next((log for log in reversed(logs) if im in log), None)
    else:
        last_logs[container] = None
# Get docker containers being monitored in kuma
# (the whole file is kept as one string; the handle is closed as soon as it is read)
with open('/srv/docker/container/mon/monitoring-results/kuma.txt', 'r') as f:
    monitors = f.read()
# checks to see if the named container appears anywhere in the kuma monitor list
def InKuma( container ):
    """Return True when *container* occurs as a substring of ``monitors``, else False."""
    return container in monitors
def GetLastCommmitDaysAgo( container ):
    """Return how many whole days ago the upstream repo of *container* last had a commit.

    The service's labels are searched for ``last.commit.url=<url>``. The URL
    is fetched and must return a JSON list whose first element carries
    ``commit.author.date`` as an ISO-8601 timestamp.

    Returns -1 when the service has no labels, has no such label, or the
    lookup fails (network error, bad HTTP status, unexpected payload). A
    failed lookup is reported on stderr instead of aborting the whole run.
    """
    import sys  # local import: only needed for the warning below

    # services without a labels section are legal, treat them as "no url"
    for tmp in compose['services'][container].get('labels') or []:
        m = re.search('^last.commit.url=(.+)', tmp )
        if not m:
            continue
        try:
            # bounded wait so one unresponsive API cannot hang the metrics run
            response = requests.get(m[1], timeout=30)
            response.raise_for_status()  # Raise an error for bad status codes
            commit_date_str = response.json()[0]["commit"]["author"]["date"]
            # fromisoformat() on older Pythons does not accept a trailing 'Z'
            commit_date = datetime.fromisoformat(commit_date_str.replace('Z', '+00:00'))
            # both sides are timezone-aware, so the difference is exact
            return (datetime.now(timezone.utc) - commit_date).days
        except (requests.RequestException, ValueError, LookupError, TypeError) as err:
            print(f'warning: last commit lookup failed for {container}: {err}', file=sys.stderr)
            return -1
    return -1
# Wall-clock time of this run; image ages are measured against it
current_datetime = datetime.now()

_METRICS_FILE = '/srv/docker/container/telegraf/monitoring-results/docker_updates.prom'
_COMPOSE_DIR = '/srv/docker/config'
# A layer timestamp as printed by `docker history --human=false`, with its UTC offset if present
_STAMP_RE = re.compile(r'(\d{1,4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2}:\d{1,2})(Z|[+-]\d{1,2}:\d{1,2})?')


def _history_timestamp(image_ref):
    """Return the first real layer timestamp of *image_ref* as a naive datetime.

    Rows stamped 0001-01-01 (no creation date recorded) are skipped.
    Returns the string 'No Date' when docker fails or no usable timestamp is found.
    """
    res = subprocess.run(['sudo', 'docker', 'image', 'history', '--human=false', image_ref],
                         stdout=subprocess.PIPE)
    if res.returncode != 0:
        return 'No Date'
    for stamp, offset in _STAMP_RE.findall(res.stdout.decode('utf-8', 'replace')):
        if '0001-01-01' in stamp:
            continue
        try:
            # norm_time() insists on an offset; +10:00 is only a placeholder when none was printed
            return norm_time(stamp + (offset or '+10:00'))
        except ValueError:
            # looked like a timestamp but is not a valid one, try the next
            continue
    return 'No Date'


def _built_image_ref(container):
    """Return the 2nd column of the last `docker compose images` row for *container*, or None."""
    res = subprocess.run(['sudo', 'docker', 'compose', 'images', container],
                         cwd=_COMPOSE_DIR, stdout=subprocess.PIPE)
    if res.returncode != 0:
        return None
    rows = res.stdout.decode('utf-8', 'replace').splitlines()
    if not rows:
        return None
    m = re.search(r'(\S+)\s+(\S+)', rows[-1])
    return m[2] if m else None


# Collect everything in memory first and write the file in one go at the end:
# gathering the data takes a while (docker + http calls), and telegraf must not
# see a half-written file. If the run dies part way, the previous file is kept.
metric_lines = [
    # required help/type text
    '# HELP node_docker_updates details of last known update of a container, whether it has a locked version (or latest tag) and whether watchtower is updating it',
    '# TYPE node_docker_updates gauge',
]

for container in last_logs:
    # skip containers that are not running or are deliberately excluded
    if not IsRunning( container ) or Excluded( container ):
        continue

    service = compose['services'][container]
    is_built = 'image' not in service
    im = '' if is_built else service['image']

    # an image counts as pinned only when it has a tag that is not a moving one
    pinned = (':' in im and ':latest' not in im and ':nightly' not in im
              and ':beta' not in im and im != 'php:apache')

    if watchtower( container ):
        watchtower_state = 'yes'
    elif is_built:
        watchtower_state = 'Built'
    else:
        watchtower_state = 'no'

    # NOTE: an earlier variant took the date of locally built images (pybook,
    # photoassistant) from the last git commit of their source checkout instead
    if is_built:
        built_ref = _built_image_ref( container )
        last_update = _history_timestamp( built_ref ) if built_ref else 'No Date'
    else:
        last_update = _history_timestamp( ImageIdFor( container ) )

    if isinstance(last_update, datetime):
        age_in_days = (current_datetime - last_update).days
    else:
        age_in_days = 99999

    days_ago = GetLastCommmitDaysAgo( container )
    if days_ago < 0:
        days_ago = 99999

    pairs = (
        ('container', container),
        ('latest_tag', 'no' if pinned else 'yes'),
        ('image', 'Built' if is_built else im),
        ('watchtower', watchtower_state),
        ('monitored', InKuma(container)),
        ('age_in_days', age_in_days),
        ('repo_commit_in_days', days_ago),
        ('last_update', last_update),
        ('internet_facing', external(container)),
    )
    labels_text = ', '.join(f'{key}="{value}"' for key, value in pairs)
    metric_lines.append('node_docker_updates{' + labels_text + '} 1')

# write prometheus formatted data; the handle is closed even if the write fails
with open(_METRICS_FILE, 'w') as f:
    f.write('\n'.join(metric_lines) + '\n')

16
telegraf-watchtower-enabled Executable file
View File

@@ -0,0 +1,16 @@
#!/bin/bash
#
# Count the watchtower enable labels in docker-compose.yml and record the
# count for telegraf.
#
# As prometheus is just looking at the last state of the file, if this script
# stopped running it would see the last values and think they are 'current'.
#
# SO, lets date 'stamp' when we ran this, rather than give a boolean, because if
# its running this will update the date, if not, it wont anyway :)
log_file=/srv/docker/container/telegraf/monitoring-results/watchtower-enabled.influx
compose_file=/srv/docker/config/docker-compose.yml
# drop comment lines (also indented ones, so commented-out labels are not counted),
# then count the lines carrying the label; -F matches the dots literally
cnt=$(grep -Ev '^[[:space:]]*#' "$compose_file" | grep -cF com.centurylinklabs.watchtower.enable)
tstamp=$(date +%s%N)
# using influx format
echo "mara_watchtower_enabled count=$cnt $tstamp" > "$log_file"