From 00b3d5edb5bf10e485f4195d73c2f22d62438d21 Mon Sep 17 00:00:00 2001 From: Benjamin Dauvergne Date: Mon, 12 Aug 2019 18:17:11 +0200 Subject: [PATCH 2/2] log errors for down connectors when it persists (#35380) --- .../migrations/0014_auto_20190820_0914.py | 26 ++++ passerelle/base/models.py | 95 ++++++++++++-- tests/test_proxylogger.py | 121 ++++++++++++++++-- 3 files changed, 224 insertions(+), 18 deletions(-) create mode 100644 passerelle/base/migrations/0014_auto_20190820_0914.py diff --git a/passerelle/base/migrations/0014_auto_20190820_0914.py b/passerelle/base/migrations/0014_auto_20190820_0914.py new file mode 100644 index 00000000..cee7dc79 --- /dev/null +++ b/passerelle/base/migrations/0014_auto_20190820_0914.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.20 on 2019-08-20 14:14 +from __future__ import unicode_literals + +from django.db import migrations, models +import passerelle.base.models + + +class Migration(migrations.Migration): + + dependencies = [ + ('base', '0013_delete_templatevar'), + ] + + operations = [ + migrations.AddField( + model_name='availabilityparameters', + name='notification_delays', + field=models.TextField(default=b'0', validators=[passerelle.base.models.validate_notification_delays]), + ), + migrations.AddField( + model_name='resourcestatus', + name='last_notification_timestamp', + field=models.DateTimeField(blank=True, null=True), + ), + ] diff --git a/passerelle/base/models.py b/passerelle/base/models.py index a4630010..f240ea43 100644 --- a/passerelle/base/models.py +++ b/passerelle/base/models.py @@ -8,6 +8,7 @@ import re import sys import traceback import base64 +import itertools import django from django.apps import apps @@ -20,6 +21,7 @@ from django.test import override_settings from django.utils.text import slugify from django.utils import timezone, six from django.utils.translation import ugettext_lazy as _ +from django.utils.timezone import now from django.core.files.base import ContentFile from django.contrib.contenttypes.models import ContentType @@ -401,7 +403,8 @@ class BaseResource(models.Model): if not self.availability_parameters.run_check: return - currently_down = self.down() + availability_parameters = self.availability_parameters + try: self.check_status() status = 'up' @@ -409,26 +412,61 @@ class BaseResource(models.Model): except NotImplementedError: return except Exception as e: + from passerelle.utils.conversion import exception_to_text + status = 'down' - message = repr(e)[:500] + message = exception_to_text(e)[:500] resource_type = ContentType.objects.get_for_model(self) current_status = ResourceStatus.objects.filter( - resource_type=resource_type, - resource_pk=self.pk).first() + resource_type=resource_type, + resource_pk=self.pk).first() if not current_status or status != current_status.status: - if status == 'down' and not currently_down: - self.logger.error(u'connector "%s" (%s) is now down', self, self.__class__.__name__) + if status == 'down' and not self.down(): # new downtime + if availability_parameters.has_zero_delay(): + self.logger.error(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message) + else: + self.logger.warning(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message) ResourceStatus( resource_type=resource_type, resource_pk=self.pk, status=status, message=message).save() - if status == 'up' and currently_down: + if status == 'up' and current_status: self.logger.info(u'connector "%s" (%s) is back up', self, self.__class__.__name__) elif status == 'down': - current_status.message = message - current_status.save() + # check last_notification_downtime and current downtime to see if it matches a new notification delay + last_notification_timestamp = current_status.last_notification_timestamp or current_status.start_timestamp + + current_time = now() + downtime = (current_time - current_status.start_timestamp).total_seconds() // 60 + last_notification_downtime = ( + last_notification_timestamp - current_status.start_timestamp).total_seconds() // 60 + + for delay in availability_parameters.notification_delays_generator(): + if not delay: + continue + if downtime >= delay: + if last_notification_downtime < delay: + days = downtime // 1440 + hours = downtime // 60 + if days > 1: + human_duration = 'for %d days' % days + elif hours > 1: + human_duration = 'for %d hours' % hours + else: + human_duration = 'for %d minutes' % downtime + self.logger.error(u'connector "%s" (%s) is still down %s: %s', + self, self.__class__.__name__, + human_duration, + message, + # when connector is down, logging is shutdown + force=True) + ResourceStatus.objects.filter(pk=current_status.pk).update( + message=message, last_notification_timestamp=current_time) + break + else: + break def hourly(self): pass @@ -537,6 +575,25 @@ class LoggingParameters(models.Model): unique_together = (('resource_type', 'resource_pk')) +def parse_notification_delays(value): + delays = [int(v.strip()) for v in value.split(',')] + if not all(delay >= 0 for delay in delays): + raise ValueError + if delays != sorted(delays): + raise ValueError + if len(set(delays)) != len(delays): + raise ValueError + return delays + + +def validate_notification_delays(value): + value = value or u'' + try: + parse_notification_delays(value) + except ValueError: + raise ValidationError(_('You must provide a list of increasing integers')) + + class AvailabilityParameters(models.Model): resource_type = models.ForeignKey(ContentType) resource_pk = models.PositiveIntegerField() @@ -544,6 +601,22 @@ class AvailabilityParameters(models.Model): run_check = models.BooleanField( default=True, verbose_name=_('Run regular availability checks'), help_text=_('Run an availability check every 5 minutes')) + notification_delays = models.TextField( + default='0', + blank=False, + validators=[validate_notification_delays]) + + def has_zero_delay(self): + return 0 in parse_notification_delays(self.notification_delays) + + def notification_delays_generator(self): + notification_delays = parse_notification_delays(self.notification_delays) + last_notification_delay = notification_delays[-1] + if last_notification_delay > 1: + notification_delays = itertools.chain( + notification_delays, + itertools.count(2 * last_notification_delay, last_notification_delay)) + return notification_delays class Meta: unique_together = (('resource_type', 'resource_pk')) @@ -609,6 +682,7 @@ class ResourceStatus(models.Model): start_timestamp = models.DateTimeField(auto_now_add=True) status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='unknown') message = models.CharField(max_length=500, blank=True) + last_notification_timestamp = models.DateTimeField(blank=True, null=True) class Meta: ordering = ['-start_timestamp'] @@ -635,7 +709,8 @@ class ProxyLogger(object): return self._logger.getEffectiveLevel() def _log(self, levelname, message, *args, **kwargs): - if self.connector.down(): + force = kwargs.pop('force', False) + if self.connector.down() and not force: # don't log if the connector is known to be down return levelno = getattr(logging, levelname) diff --git a/tests/test_proxylogger.py b/tests/test_proxylogger.py index f67b672b..bb9e6dc2 100644 --- a/tests/test_proxylogger.py +++ b/tests/test_proxylogger.py @@ -1,11 +1,15 @@ # -*- coding: utf-8 -*- import logging +import datetime +import itertools import pytest from httmock import all_requests, HTTMock -from passerelle.base.models import ProxyLogger, ResourceLog +from django.core.exceptions import ValidationError + +from passerelle.base.models import ProxyLogger, ResourceLog, AvailabilityParameters from passerelle.apps.feeds.models import Feed from .test_availability import down_mock, up_mock @@ -66,20 +70,121 @@ def test_proxy_logger_ignore_when_down(db, connector): pr.debug(u'some message') assert len(ResourceLog.objects.all()) == 0 -def test_log_on_connector_availability_change(db, connector): + +def test_validate_notification_delays(db, connector): + def take(iterator, count): + return list(itertools.compress(iterator, range(1, count + 1))) + + availability_parameters = connector.availability_parameters + assert availability_parameters.notification_delays == '0' + + availability_parameters.full_clean() + assert availability_parameters.has_zero_delay() + assert list(availability_parameters.notification_delays_generator()) == [0] + + availability_parameters.notification_delays = '0,5,100,1000' + availability_parameters.full_clean() + assert availability_parameters.has_zero_delay() + assert take(availability_parameters.notification_delays_generator(), 6) == [0, 5, 100, 1000, 2000, 3000] + + availability_parameters.notification_delays = '5,100,1000' + availability_parameters.full_clean() + assert not availability_parameters.has_zero_delay() + assert take(availability_parameters.notification_delays_generator(), 6) == [5, 100, 1000, 2000, 3000, 4000] + + availability_parameters.notification_delays = '5' + availability_parameters.full_clean() + assert not availability_parameters.has_zero_delay() + assert take(availability_parameters.notification_delays_generator(), 3) == [5, 10, 15] + + # validation errors + availability_parameters.notification_delays = '5,100,100' + with pytest.raises(ValidationError): + availability_parameters.full_clean() + + availability_parameters.notification_delays = '' + with pytest.raises(ValidationError): + availability_parameters.full_clean() + + availability_parameters.notification_delays = '1,x' + with pytest.raises(ValidationError): + availability_parameters.full_clean() + + availability_parameters.notification_delays = '10,1' + with pytest.raises(ValidationError): + availability_parameters.full_clean() + + +@pytest.mark.parametrize('notification_delays', ['0', '0,5,100', '5,100']) +def test_log_on_connector_availability_change(db, connector, freezer, notification_delays): connector.title = u'éléphant' + availability_parameters = connector.availability_parameters + availability_parameters.notification_delays = notification_delays + availability_parameters.save() + with HTTMock(up_mock): # set connector as up connector.availability() + assert ResourceLog.objects.count() == 1 + assert ResourceLog.objects.latest('id').level == 'info' + assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)' ResourceLog.objects.all().delete() + + # move 5 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1)) + with HTTMock(down_mock): # set connector as down connector.availability() - assert len(ResourceLog.objects.all()) == 2 + assert connector.down() + last_count1 = ResourceLog.objects.count() + assert last_count1 == 2 assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)' - assert ResourceLog.objects.all()[1].level == 'error' - assert ResourceLog.objects.all()[1].message == u'connector "éléphant" (Feed) is now down' + assert ResourceLog.objects.all()[1].level == 'error' if notification_delays.startswith('0') else 'warning' + assert (u'connector "éléphant" (Feed) is now down: 404 Client' + in ResourceLog.objects.all()[1].message) + + # move 5 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1)) + + # second time log as error + with HTTMock(down_mock): # connector is still down + connector.availability() + assert connector.down() + last_count2 = ResourceLog.objects.count() + if notification_delays == '0': + assert last_count2 == last_count1 + else: + assert last_count2 == last_count1 + 1 + assert ResourceLog.objects.all()[2].level == 'error' if notification_delays != '0' else 'warning' + assert (u'connector "éléphant" (Feed) is still down for 5 minutes: 404' + in ResourceLog.objects.all()[2].message) + + # move 3 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1)) + + # third time no log + with HTTMock(down_mock): # connector is still down + connector.availability() + assert ResourceLog.objects.count() == last_count2 + + # move 3 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1)) + + with HTTMock(up_mock): # set connector as up + connector.availability() + assert not connector.down() + last_count3 = ResourceLog.objects.count() + assert last_count3 == last_count2 + 1 + assert ResourceLog.objects.latest('id').level == 'info' + assert ResourceLog.objects.latest('id').message == u'connector "éléphant" (Feed) is back up' + + # move 3 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1)) + # last message is a GET with HTTMock(up_mock): # set connector as up connector.availability() - assert len(ResourceLog.objects.all()) == 3 - assert ResourceLog.objects.all()[2].level == 'info' - assert ResourceLog.objects.all()[2].message == u'connector "éléphant" (Feed) is back up' + assert not connector.down() + last_count4 = ResourceLog.objects.count() + assert last_count4 == last_count3 + 1 + assert ResourceLog.objects.latest('id').level == 'info' + assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)' -- 2.23.0.rc1