Merge pull request #838 from YunoHost/improve-service-status-checks

[enh] Improve service status checks (and service add CLI)
This commit is contained in:
Alexandre Aubin 2019-11-18 14:11:37 +01:00 committed by GitHub
commit d50a69fe3e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 146 additions and 122 deletions

View file

@ -960,24 +960,13 @@ service:
arguments: arguments:
name: name:
help: Service name to add help: Service name to add
-s: -d:
full: --status full: --description
help: Custom status command help: Description of the service
-l: -l:
full: --log full: --log
help: Absolute path to log file to display help: Absolute path to log file to display
nargs: "+" nargs: "+"
-r:
full: --runlevel
help: Runlevel priority of the service
type: int
-n:
full: --need_lock
help: Use this option to prevent deadlocks if the service does invoke yunohost commands.
action: store_true
-d:
full: --description
help: Description of the service
-t: -t:
full: --log_type full: --log_type
help: Type of the log (file or systemd) help: Type of the log (file or systemd)
@ -986,6 +975,22 @@ service:
- file - file
- systemd - systemd
default: file default: file
--test_status:
help: Specify a custom bash command to check the status of the service. Note that it only makes sense to specify this if the corresponding systemd service does not return the proper information already.
--test_conf:
help: Specify a custom bash command to check if the configuration of the service is valid or broken, similar to nginx -t.
--needs_exposed_ports:
help: A list of ports that needs to be publicly exposed for the service to work as intended.
nargs: "+"
type: int
metavar: PORT
-n:
full: --need_lock
help: Use this option to prevent deadlocks if the service does invoke yunohost commands.
action: store_true
-s:
full: --status
help: Deprecated, old option. Does nothing anymore. Possibly check the --test_status option.
### service_remove() ### service_remove()
remove: remove:

View file

@ -101,6 +101,12 @@ for service, conf in new_services.items():
if conffiles: if conffiles:
services[service]['conffiles'] = conffiles services[service]['conffiles'] = conffiles
# Remove legacy /var/log/daemon.log and /var/log/syslog from log entries
# because they are too general. Instead, now the journalctl log is
# returned by default which is more relevant.
if "log" in services[service]:
if services[service]["log"] in ["/var/log/syslog", "/var/log/daemon.log"]:
del services[service]["log"]
if updated: if updated:
with open('/etc/yunohost/services.yml-new', 'w') as f: with open('/etc/yunohost/services.yml-new', 'w') as f:

View file

@ -5,7 +5,7 @@ import requests
from yunohost.diagnosis import Diagnoser from yunohost.diagnosis import Diagnoser
from yunohost.utils.error import YunohostError from yunohost.utils.error import YunohostError
from yunohost.service import _get_services
class PortsDiagnoser(Diagnoser): class PortsDiagnoser(Diagnoser):
@ -15,16 +15,18 @@ class PortsDiagnoser(Diagnoser):
def run(self): def run(self):
# FIXME / TODO : in the future, maybe we want to report different # This dict is something like :
# things per port depending on how important they are # { 80: "nginx",
# (e.g. XMPP sounds to me much less important than other ports) # 25: "postfix",
# Ideally, a port could be related to a service... # 443: "nginx"
# FIXME / TODO : for now this list of port is hardcoded, might want # ... }
# to fetch this from the firewall.yml in /etc/yunohost/ ports = {}
ports = [22, 25, 53, 80, 443, 587, 993, 5222, 5269] for service, infos in _get_services().items():
for port in infos.get("needs_exposed_ports", []):
ports[port] = service
try: try:
r = requests.post('https://diagnosis.yunohost.org/check-ports', json={'ports': ports}, timeout=30).json() r = requests.post('https://diagnosis.yunohost.org/check-ports', json={'ports': ports.keys()}, timeout=30).json()
if "status" not in r.keys(): if "status" not in r.keys():
raise Exception("Bad syntax for response ? Raw json: %s" % str(r)) raise Exception("Bad syntax for response ? Raw json: %s" % str(r))
elif r["status"] == "error": elif r["status"] == "error":
@ -37,15 +39,17 @@ class PortsDiagnoser(Diagnoser):
except Exception as e: except Exception as e:
raise YunohostError("diagnosis_ports_could_not_diagnose", error=e) raise YunohostError("diagnosis_ports_could_not_diagnose", error=e)
for port in ports: for port, service in ports.items():
if r["ports"].get(str(port), None) is not True: if r["ports"].get(str(port), None) is not True:
yield dict(meta={"port": port}, yield dict(meta={"port": port, "needed_by": service},
status="ERROR", status="ERROR",
summary=("diagnosis_ports_unreachable", {"port": port})) summary=("diagnosis_ports_unreachable", {"port": port}),
details=[("diagnosis_ports_needed_by", (service,)), ("diagnosis_ports_forwarding_tip", ())])
else: else:
yield dict(meta={}, yield dict(meta={"port": port, "needed_by": service},
status="SUCCESS", status="SUCCESS",
summary=("diagnosis_ports_ok", {"port": port})) summary=("diagnosis_ports_ok", {"port": port}),
details=[("diagnosis_ports_needed_by", (service))])
def main(args, env, loggers): def main(args, env, loggers):

View file

@ -5,17 +5,6 @@ import os
from yunohost.diagnosis import Diagnoser from yunohost.diagnosis import Diagnoser
from yunohost.service import service_status from yunohost.service import service_status
# TODO : all these are arbitrary, should be collectively validated
services_ignored = {"glances"}
services_critical = {"dnsmasq", "fail2ban", "yunohost-firewall", "nginx", "slapd", "ssh"}
# TODO / FIXME : we should do something about this postfix thing
# The nominal value is to be "exited" ... some daemon is actually running
# in a different thread that the thing started by systemd, which is fine
# but somehow sometimes it gets killed and there's no easy way to detect it
# Just randomly restarting it will fix ths issue. We should find some trick
# to identify the PID of the process and check it's still up or idk
services_expected_to_be_exited = {"postfix", "yunohost-firewall"}
class ServicesDiagnoser(Diagnoser): class ServicesDiagnoser(Diagnoser):
id_ = os.path.splitext(os.path.basename(__file__))[0].split("-")[1] id_ = os.path.splitext(os.path.basename(__file__))[0].split("-")[1]
@ -28,23 +17,22 @@ class ServicesDiagnoser(Diagnoser):
for service, result in sorted(all_result.items()): for service, result in sorted(all_result.items()):
if service in services_ignored:
continue
item = dict(meta={"service": service}) item = dict(meta={"service": service})
expected_status = "running" if service not in services_expected_to_be_exited else "exited"
# TODO / FIXME : might also want to check that services are enabled if result["status"] != "running":
item["status"] = "ERROR"
item["summary"] = ("diagnosis_services_bad_status", {"service": service, "status": result["status"]})
if result["active"] != "active" or result["status"] != expected_status: elif result["configuration"] == "broken":
item["status"] = "WARNING" if service not in services_critical else "ERROR" item["status"] = "WARNING"
item["summary"] = ("diagnosis_services_bad_status", {"service": service, "status": result["active"] + "/" + result["status"]}) item["summary"] = ("diagnosis_services_conf_broken", {"service": service})
# TODO : could try to append the tail of the service log to the "details" key ...
else: else:
item["status"] = "SUCCESS" item["status"] = "SUCCESS"
item["summary"] = ("diagnosis_services_good_status", {"service": service, "status": result["active"] + "/" + result["status"]}) item["summary"] = ("diagnosis_services_running", {"service": service, "status": result["status"]})
if result["configuration"] == "broken":
item["details"] = [(d, tuple()) for d in result["configuration-details"]]
yield item yield item
def main(args, env, loggers): def main(args, env, loggers):

View file

@ -15,19 +15,6 @@ class RegenconfDiagnoser(Diagnoser):
def run(self): def run(self):
# nginx -t
p = subprocess.Popen("nginx -t".split(),
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
out, _ = p.communicate()
if p.returncode != 0:
yield dict(meta={"test": "nginx-t"},
status="ERROR",
summary=("diagnosis_regenconf_nginx_conf_broken", {}),
details=[(out, ())]
)
regenconf_modified_files = manually_modified_files() regenconf_modified_files = manually_modified_files()
debian_modified_files = manually_modified_files_compared_to_debian_default(ignore_handled_by_regenconf=True) debian_modified_files = manually_modified_files_compared_to_debian_default(ignore_handled_by_regenconf=True)

View file

@ -1,36 +1,42 @@
nginx: avahi-daemon: {}
log: /var/log/nginx dnsmasq: {}
avahi-daemon:
log: /var/log/daemon.log
dnsmasq:
log: /var/log/daemon.log
fail2ban:
log: /var/log/fail2ban.log
dovecot: dovecot:
log: [/var/log/mail.log,/var/log/mail.err] log: [/var/log/mail.log,/var/log/mail.err]
postfix: needs_exposed_ports: [993]
log: [/var/log/mail.log,/var/log/mail.err] fail2ban:
rspamd: log: /var/log/fail2ban.log
log: /var/log/rspamd/rspamd.log
redis-server:
log: /var/log/redis/redis-server.log
mysql:
log: [/var/log/mysql.log,/var/log/mysql.err]
alternates: ['mariadb']
ssh:
log: /var/log/auth.log
metronome: metronome:
log: [/var/log/metronome/metronome.log,/var/log/metronome/metronome.err] log: [/var/log/metronome/metronome.log,/var/log/metronome/metronome.err]
slapd: needs_exposed_ports: [5222, 5269]
log: /var/log/syslog mysql:
log: [/var/log/mysql.log,/var/log/mysql.err,/var/log/mysql/error.log]
alternates: ['mariadb']
nginx:
log: /var/log/nginx
test_conf: nginx -t
needs_exposed_ports: [80, 443]
nslcd: {}
php7.0-fpm: php7.0-fpm:
log: /var/log/php7.0-fpm.log log: /var/log/php7.0-fpm.log
test_conf: php-fpm7.0 --test
postfix:
log: [/var/log/mail.log,/var/log/mail.err]
test_status: systemctl show postfix@- | grep -q "^SubState=running"
needs_exposed_ports: [25, 587]
redis-server:
log: /var/log/redis/redis-server.log
rspamd:
log: /var/log/rspamd/rspamd.log
slapd: {}
ssh:
log: /var/log/auth.log
test_conf: sshd -t
needs_exposed_ports: [22]
yunohost-api: yunohost-api:
log: /var/log/yunohost/yunohost-api.log log: /var/log/yunohost/yunohost-api.log
yunohost-firewall: yunohost-firewall:
need_lock: true need_lock: true
nslcd: test_status: iptables -S | grep "^-A INPUT" | grep " --dport" | grep -q ACCEPT
log: /var/log/syslog
glances: null glances: null
nsswitch: null nsswitch: null
ssl: null ssl: null

View file

@ -172,8 +172,9 @@
"diagnosis_dns_bad_conf": "Bad / missing DNS configuration for domain {domain} (category {category})", "diagnosis_dns_bad_conf": "Bad / missing DNS configuration for domain {domain} (category {category})",
"diagnosis_dns_missing_record": "According to the recommended DNS configuration, you should add a DNS record with type {0}, name {1} and value {2}", "diagnosis_dns_missing_record": "According to the recommended DNS configuration, you should add a DNS record with type {0}, name {1} and value {2}",
"diagnosis_dns_discrepancy": "According to the recommended DNS configuration, the value for the DNS record with type {0} and name {1} should be {2}, not {3}.", "diagnosis_dns_discrepancy": "According to the recommended DNS configuration, the value for the DNS record with type {0} and name {1} should be {2}, not {3}.",
"diagnosis_services_good_status": "Service {service} is {status} as expected!", "diagnosis_services_running": "Service {service} is running!",
"diagnosis_services_bad_status": "Service {service} is {status} :/", "diagnosis_services_conf_broken": "Configuration is broken for service {service}!",
"diagnosis_services_bad_status": "Service {service} is {status} :(",
"diagnosis_diskusage_verylow": "Storage {mountpoint} (on device {device}) has only {free_abs_GB} GB ({free_percent}%) space remaining. You should really consider cleaning up some space.", "diagnosis_diskusage_verylow": "Storage {mountpoint} (on device {device}) has only {free_abs_GB} GB ({free_percent}%) space remaining. You should really consider cleaning up some space.",
"diagnosis_diskusage_low": "Storage {mountpoint} (on device {device}) has only {free_abs_GB} GB ({free_percent}%) space remaining. Be careful.", "diagnosis_diskusage_low": "Storage {mountpoint} (on device {device}) has only {free_abs_GB} GB ({free_percent}%) space remaining. Be careful.",
"diagnosis_diskusage_ok": "Storage {mountpoint} (on device {device}) still has {free_abs_GB} GB ({free_percent}%) space left!", "diagnosis_diskusage_ok": "Storage {mountpoint} (on device {device}) still has {free_abs_GB} GB ({free_percent}%) space left!",
@ -207,6 +208,8 @@
"diagnosis_ports_could_not_diagnose": "Could not diagnose if ports are reachable from outside. Error: {error}", "diagnosis_ports_could_not_diagnose": "Could not diagnose if ports are reachable from outside. Error: {error}",
"diagnosis_ports_unreachable": "Port {port} is not reachable from outside.", "diagnosis_ports_unreachable": "Port {port} is not reachable from outside.",
"diagnosis_ports_ok": "Port {port} is reachable from outside.", "diagnosis_ports_ok": "Port {port} is reachable from outside.",
"diagnosis_ports_needed_by": "Exposing this port is needed for service {0}",
"diagnosis_ports_forwarding_tip": "To fix this issue, most probably you need to configure port forwarding on your internet router as described in https://yunohost.org/port_forwarding",
"diagnosis_http_could_not_diagnose": "Could not diagnose if domain is reachable from outside. Error: {error}", "diagnosis_http_could_not_diagnose": "Could not diagnose if domain is reachable from outside. Error: {error}",
"diagnosis_http_ok": "Domain {domain} is reachable from outside.", "diagnosis_http_ok": "Domain {domain} is reachable from outside.",
"diagnosis_http_unreachable": "Domain {domain} is unreachable through HTTP from outside.", "diagnosis_http_unreachable": "Domain {domain} is unreachable through HTTP from outside.",
@ -535,7 +538,6 @@
"service_disabled": "The '{service:s}' service was turned off", "service_disabled": "The '{service:s}' service was turned off",
"service_enable_failed": "Could not turn on the service '{service:s}'\n\nRecent service logs:{logs:s}", "service_enable_failed": "Could not turn on the service '{service:s}'\n\nRecent service logs:{logs:s}",
"service_enabled": "The '{service:s}' service was turned off", "service_enabled": "The '{service:s}' service was turned off",
"service_no_log": "No logs to display for the service '{service:s}'",
"service_regen_conf_is_deprecated": "'yunohost service regen-conf' is deprecated! Please use 'yunohost tools regen-conf' instead.", "service_regen_conf_is_deprecated": "'yunohost service regen-conf' is deprecated! Please use 'yunohost tools regen-conf' instead.",
"service_remove_failed": "Could not remove the service '{service:s}'", "service_remove_failed": "Could not remove the service '{service:s}'",
"service_removed": "'{service:s}' service removed", "service_removed": "'{service:s}' service removed",

View file

@ -2829,7 +2829,7 @@ def _assert_system_is_sane_for_app(manifest, when):
services.append("fail2ban") services.append("fail2ban")
# List services currently down and raise an exception if any are found # List services currently down and raise an exception if any are found
faulty_services = [s for s in services if service_status(s)["active"] != "active"] faulty_services = [s for s in services if service_status(s)["status"] != "running"]
if faulty_services: if faulty_services:
if when == "pre": if when == "pre":
raise YunohostError('app_action_cannot_be_ran_because_required_services_down', raise YunohostError('app_action_cannot_be_ran_because_required_services_down',

View file

@ -40,25 +40,24 @@ MOULINETTE_LOCK = "/var/run/moulinette_yunohost.lock"
logger = log.getActionLogger('yunohost.service') logger = log.getActionLogger('yunohost.service')
def service_add(name, status=None, log=None, runlevel=None, need_lock=False, description=None, log_type="file"): def service_add(name, description=None, log=None, log_type="file", test_status=None, test_conf=None, needs_exposed_ports=None, need_lock=False, status=None):
""" """
Add a custom service Add a custom service
Keyword argument: Keyword argument:
name -- Service name to add name -- Service name to add
status -- Custom status command
log -- Absolute path to log file to display
runlevel -- Runlevel priority of the service
need_lock -- Use this option to prevent deadlocks if the service does invoke yunohost commands.
description -- description of the service description -- description of the service
log_type -- Precise if the corresponding log is a file or a systemd log log -- Absolute path to log file to display
log_type -- Specify if the corresponding log is a file or a systemd log
test_status -- Specify a custom bash command to check the status of the service. N.B. : it only makes sense to specify this if the corresponding systemd service does not return the proper information.
test_conf -- Specify a custom bash command to check if the configuration of the service is valid or broken, similar to nginx -t.
needs_exposed_ports -- A list of ports that needs to be publicly exposed for the service to work as intended.
need_lock -- Use this option to prevent deadlocks if the service does invoke yunohost commands.
status -- Deprecated, doesn't do anything anymore. Use test_status instead.
""" """
services = _get_services() services = _get_services()
if not status: services[name] = {}
services[name] = {'status': 'service'}
else:
services[name] = {'status': status}
if log is not None: if log is not None:
if not isinstance(log, list): if not isinstance(log, list):
@ -77,15 +76,22 @@ def service_add(name, status=None, log=None, runlevel=None, need_lock=False, des
else: else:
raise YunohostError('service_add_failed', service=name) raise YunohostError('service_add_failed', service=name)
if description:
if runlevel is not None: services[name]['description'] = description
services[name]['runlevel'] = runlevel else:
logger.warning("/!\\ Packager ! You added a custom service without specifying a description. Please add --description to explain what the service does in a similar fashion to existing services.")
if need_lock: if need_lock:
services[name]['need_lock'] = True services[name]['need_lock'] = True
if description is not None: if test_status:
services[name]['description'] = description services[name]["test_status"] = test_status
if test_conf:
services[name]["test_conf"] = test_conf
if needs_exposed_ports:
services[name]["needs_exposed_ports"] = needs_exposed_ports
try: try:
_save_services(services) _save_services(services)
@ -277,7 +283,7 @@ def service_status(names=[]):
# the hack was to add fake services... # the hack was to add fake services...
# we need to extract regenconf from service at some point, also because # we need to extract regenconf from service at some point, also because
# some app would really like to use it # some app would really like to use it
if "status" in services[name] and services[name]["status"] is None: if services[name].get("status", "") is None:
continue continue
status = _get_service_information_from_systemd(name) status = _get_service_information_from_systemd(name)
@ -292,11 +298,10 @@ def service_status(names=[]):
logger.error("Failed to get status information via dbus for service %s, systemctl didn't recognize this service ('NoSuchUnit')." % name) logger.error("Failed to get status information via dbus for service %s, systemctl didn't recognize this service ('NoSuchUnit')." % name)
result[name] = { result[name] = {
'status': "unknown", 'status': "unknown",
'loaded': "unknown", 'start_on_boot': "unknown",
'active': "unknown", 'last_state_change': "unknown",
'active_at': "unknown",
'description': "Error: failed to get information for this service, it doesn't exists for systemd", 'description': "Error: failed to get information for this service, it doesn't exists for systemd",
'service_file_path': "unknown", 'configuration': "unknown",
} }
else: else:
@ -314,21 +319,38 @@ def service_status(names=[]):
result[name] = { result[name] = {
'status': str(status.get("SubState", "unknown")), 'status': str(status.get("SubState", "unknown")),
'loaded': str(status.get("UnitFileState", "unknown")), 'start_on_boot': str(status.get("UnitFileState", "unknown")),
'active': str(status.get("ActiveState", "unknown")), 'last_state_change': "unknown",
'description': description, 'description': description,
'service_file_path': str(status.get("FragmentPath", "unknown")), 'configuration': "unknown",
} }
# Fun stuff™ : to obtain the enabled/disabled status for sysv services, # Fun stuff™ : to obtain the enabled/disabled status for sysv services,
# gotta do this ... cf code of /lib/systemd/systemd-sysv-install # gotta do this ... cf code of /lib/systemd/systemd-sysv-install
if result[name]["loaded"] == "generated": if result[name]["start_on_boot"] == "generated":
result[name]["loaded"] = "enabled" if glob("/etc/rc[S5].d/S??"+name) else "disabled" result[name]["start_on_boot"] = "enabled" if glob("/etc/rc[S5].d/S??"+name) else "disabled"
if "ActiveEnterTimestamp" in status: if "StateChangeTimestamp" in status:
result[name]['active_at'] = datetime.utcfromtimestamp(status["ActiveEnterTimestamp"] / 1000000) result[name]['last_state_change'] = datetime.utcfromtimestamp(status["StateChangeTimestamp"] / 1000000)
else:
result[name]['active_at'] = "unknown" # 'test_status' is an optional field to test the status of the service using a custom command
if "test_status" in services[name]:
status = os.system(services[name]["test_status"])
result[name]["status"] = "running" if status == 0 else "failed"
# 'test_status' is an optional field to test the status of the service using a custom command
if "test_conf" in services[name]:
p = subprocess.Popen(services[name]["test_conf"],
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
out, _ = p.communicate()
if p.returncode == 0:
result[name]["configuration"] = "valid"
else:
result[name]["configuration"] = "broken"
result[name]["configuration-details"] = out.strip().split("\n")
if len(names) == 1: if len(names) == 1:
return result[names[0]] return result[names[0]]
@ -373,10 +395,7 @@ def service_log(name, number=50):
if name not in services.keys(): if name not in services.keys():
raise YunohostError('service_unknown', service=name) raise YunohostError('service_unknown', service=name)
if 'log' not in services[name]: log_list = services[name].get('log', [])
raise YunohostError('service_no_log', service=name)
log_list = services[name]['log']
log_type_list = services[name].get('log_type', []) log_type_list = services[name].get('log_type', [])
if not isinstance(log_list, list): if not isinstance(log_list, list):
@ -386,6 +405,13 @@ def service_log(name, number=50):
result = {} result = {}
# First we always add the logs from journalctl / systemd
result["journalctl"] = _get_journalctl_logs(name, int(number)).splitlines()
# Mysql and journalctl are fucking annoying, we gotta explictly fetch mariadb ...
if name == "mysql":
result["journalctl"] = _get_journalctl_logs("mariadb", int(number)).splitlines()
for index, log_path in enumerate(log_list): for index, log_path in enumerate(log_list):
log_type = log_type_list[index] log_type = log_type_list[index]