From 6187f96a98a11d103e61826368bb4645c7874a8b Mon Sep 17 00:00:00 2001 From: Benjamin Dauvergne Date: Mon, 12 Aug 2019 18:17:11 +0200 Subject: [PATCH 2/2] log errors for down connectors when it persists (#35380) --- passerelle/base/forms.py | 12 +- .../migrations/0014_auto_20190820_0914.py | 26 ++++ passerelle/base/models.py | 102 +++++++++++++-- passerelle/base/views.py | 58 ++++----- tests/test_manager.py | 15 +++ tests/test_proxylogger.py | 121 ++++++++++++++++-- tox.ini | 1 + 7 files changed, 286 insertions(+), 49 deletions(-) create mode 100644 passerelle/base/migrations/0014_auto_20190820_0914.py diff --git a/passerelle/base/forms.py b/passerelle/base/forms.py index 6621a908..86c66e07 100644 --- a/passerelle/base/forms.py +++ b/passerelle/base/forms.py @@ -1,6 +1,7 @@ from django import forms -from .models import ApiUser, AccessRight +from .models import ApiUser, AccessRight, AvailabilityParameters, parse_notification_delays + class ApiUserForm(forms.ModelForm): class Meta: @@ -17,3 +18,12 @@ class AccessRightForm(forms.ModelForm): 'resource_type': forms.HiddenInput(), 'resource_pk': forms.HiddenInput(), } + + +class AvailabilityParametersForm(forms.ModelForm): + class Meta: + model = AvailabilityParameters + fields = ['run_check', 'notification_delays'] + widgets = { + 'notification_delays': forms.TextInput, + } diff --git a/passerelle/base/migrations/0014_auto_20190820_0914.py b/passerelle/base/migrations/0014_auto_20190820_0914.py new file mode 100644 index 00000000..cee7dc79 --- /dev/null +++ b/passerelle/base/migrations/0014_auto_20190820_0914.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.20 on 2019-08-20 14:14 +from __future__ import unicode_literals + +from django.db import migrations, models +import passerelle.base.models + + +class Migration(migrations.Migration): + + dependencies = [ + ('base', '0013_delete_templatevar'), + ] + + operations = [ + migrations.AddField( + model_name='availabilityparameters', + name='notification_delays', + field=models.TextField(default=b'0', validators=[passerelle.base.models.validate_notification_delays]), + ), + migrations.AddField( + model_name='resourcestatus', + name='last_notification_timestamp', + field=models.DateTimeField(blank=True, null=True), + ), + ] diff --git a/passerelle/base/models.py b/passerelle/base/models.py index 6a9a5ec0..28b5cd40 100644 --- a/passerelle/base/models.py +++ b/passerelle/base/models.py @@ -8,6 +8,7 @@ import re import sys import traceback import base64 +import itertools from django.apps import apps from django.conf import settings @@ -19,6 +20,7 @@ from django.test import override_settings from django.utils.text import slugify from django.utils import timezone, six from django.utils.translation import ugettext_lazy as _ +from django.utils.timezone import now from django.core.files.base import ContentFile from django.contrib.contenttypes.models import ContentType @@ -400,7 +402,8 @@ class BaseResource(models.Model): if not self.availability_parameters.run_check: return - currently_down = self.down() + availability_parameters = self.availability_parameters + try: self.check_status() status = 'up' @@ -408,26 +411,66 @@ class BaseResource(models.Model): except NotImplementedError: return except Exception as e: + from passerelle.utils.conversion import exception_to_text + status = 'down' - message = repr(e)[:500] + message = exception_to_text(e)[:500] resource_type = ContentType.objects.get_for_model(self) current_status = ResourceStatus.objects.filter( - resource_type=resource_type, - resource_pk=self.pk).first() + resource_type=resource_type, + resource_pk=self.pk).first() if not current_status or status != current_status.status: - if status == 'down' and not currently_down: - self.logger.error(u'connector "%s" (%s) is now down', self, self.__class__.__name__) + if status == 'down' and not self.down(): # new downtime + if availability_parameters.has_zero_delay(): + self.logger.error(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message) + else: + self.logger.warning(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message) ResourceStatus( resource_type=resource_type, resource_pk=self.pk, status=status, message=message).save() - if status == 'up' and currently_down: + if status == 'up' and current_status: self.logger.info(u'connector "%s" (%s) is back up', self, self.__class__.__name__) elif status == 'down': - current_status.message = message - current_status.save() + # check last_notification_downtime and current downtime to see if it matches a new notification delay + last_notification_timestamp = current_status.last_notification_timestamp or current_status.start_timestamp + + current_time = now() + downtime = (current_time - current_status.start_timestamp).total_seconds() // 60 + last_notification_downtime = ( + last_notification_timestamp - current_status.start_timestamp).total_seconds() // 60 + + for delay in availability_parameters.notification_delays_generator(): + if not delay: + continue + if downtime >= delay: + if last_notification_downtime < delay: + days = downtime // 1440 + hours = downtime // 60 + if days > 1: + human_duration = 'for %d days' % days + elif hours > 1: + human_duration = 'for %d hours' % hours + else: + human_duration = 'for %d minutes' % downtime + self.logger.error(u'connector "%s" (%s) has been down %s: %s', + self, self.__class__.__name__, + human_duration, + message, + # when connector is down, logging is shutdown + force=True) + ResourceStatus.objects.filter(pk=current_status.pk).update( + message=message, last_notification_timestamp=current_time) + break + else: + ResourceStatus.objects.filter(pk=current_status.pk).update( + message=message) + break + else: + ResourceStatus.objects.filter(pk=current_status.pk).update( + message=message) def hourly(self): pass @@ -536,6 +579,26 @@ class LoggingParameters(models.Model): unique_together = (('resource_type', 'resource_pk')) +def parse_notification_delays(value): + delays = [int(v.strip()) for v in value.split(',')] + if not all(delay >= 0 for delay in delays): + raise ValueError + if delays != sorted(delays): + raise ValueError + if len(set(delays)) != len(delays): + raise ValueError + return delays + + +def validate_notification_delays(value): + try: + if not value: + raise ValueError + parse_notification_delays(value) + except ValueError: + raise ValidationError(_('You must provide a list of increasing minutes delays')) + + class AvailabilityParameters(models.Model): resource_type = models.ForeignKey(ContentType) resource_pk = models.PositiveIntegerField() @@ -543,6 +606,23 @@ class AvailabilityParameters(models.Model): run_check = models.BooleanField( default=True, verbose_name=_('Run regular availability checks'), help_text=_('Run an availability check every 5 minutes')) + notification_delays = models.TextField( + default='0', + blank=False, + validators=[validate_notification_delays], + help_text=_('Increasing delay between error notifications in minutes, ex.: 0,5,10')) + + def has_zero_delay(self): + return 0 in parse_notification_delays(self.notification_delays) + + def notification_delays_generator(self): + notification_delays = parse_notification_delays(self.notification_delays) + last_notification_delay = notification_delays[-1] + if last_notification_delay > 1: + notification_delays = itertools.chain( + notification_delays, + itertools.count(2 * last_notification_delay, last_notification_delay)) + return notification_delays class Meta: unique_together = (('resource_type', 'resource_pk')) @@ -608,6 +688,7 @@ class ResourceStatus(models.Model): start_timestamp = models.DateTimeField(auto_now_add=True) status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='unknown') message = models.CharField(max_length=500, blank=True) + last_notification_timestamp = models.DateTimeField(blank=True, null=True) class Meta: ordering = ['-start_timestamp'] @@ -634,7 +715,8 @@ class ProxyLogger(object): return self._logger.getEffectiveLevel() def _log(self, levelname, message, *args, **kwargs): - if self.connector.down(): + force = kwargs.pop('force', False) + if self.connector.down() and not force: # don't log if the connector is known to be down return levelno = getattr(logging, levelname) diff --git a/passerelle/base/views.py b/passerelle/base/views.py index 2db7be2f..0719cb36 100644 --- a/passerelle/base/views.py +++ b/passerelle/base/views.py @@ -15,15 +15,17 @@ # along with this program. If not, see . from django.contrib.contenttypes.models import ContentType +from django.contrib import messages from django.core.urlresolvers import reverse from django.core.exceptions import ObjectDoesNotExist, PermissionDenied from django.forms import models as model_forms from django.views.generic import ( DetailView, ListView, CreateView, UpdateView, DeleteView, FormView) from django.http import Http404 +from django.utils.translation import ugettext_lazy as _ -from .models import ApiUser, AccessRight, LoggingParameters, AvailabilityParameters, ResourceStatus -from .forms import ApiUserForm, AccessRightForm +from .models import ApiUser, AccessRight, LoggingParameters, ResourceStatus +from .forms import ApiUserForm, AccessRightForm, AvailabilityParametersForm from ..utils import get_trusted_services @@ -155,42 +157,33 @@ class LoggingParametersUpdateView(FormView): return super(LoggingParametersUpdateView, self).form_valid(form) -class ManageAvailabilityView(FormView): +class ManageAvailabilityView(UpdateView): template_name = 'passerelle/manage/manage_availability_form.html' + form_class = AvailabilityParametersForm + + def get_resource(self): + if not hasattr(self, '_resource'): + content_type = ContentType.objects.get_for_id(self.kwargs['resource_type']) + self._resource = content_type.model_class().objects.get(pk=self.kwargs['resource_pk']) + return self._resource + + def get_object(self, queryset=None): + return self.get_resource().availability_parameters def get_context_data(self, **kwargs): context = super(ManageAvailabilityView, self).get_context_data(**kwargs) - connector = self.get_resource() - context['connector'] = connector - context['availability_status'] = connector.get_availability_status() + context['connector'] = self.get_resource() + context['availability_status'] = self.get_resource().get_availability_status() return context - def get_form_class(self): - form_class = model_forms.modelform_factory( - AvailabilityParameters, - fields=['run_check']) - return form_class - - def get_initial(self): - d = self.initial.copy() - d['resource_type'] = self.kwargs['resource_type'] - d['resource_pk'] = self.kwargs['resource_pk'] - d['run_check'] = self.get_resource().availability_parameters.run_check - return d - - def get_resource(self): - content_type = ContentType.objects.get_for_id(self.kwargs['resource_type']) - return content_type.model_class().objects.get(pk=self.kwargs['resource_pk']) - def get_success_url(self): return self.get_resource().get_absolute_url() def form_valid(self, form): resource = self.get_resource() - parameters = resource.availability_parameters - run_check = form.cleaned_data['run_check'] - if not run_check and resource.down(): + # if availability check is disabled resource is forced to up to activate logs + if not form.instance.run_check and resource.down(): resource_type = ContentType.objects.get_for_model(resource) ResourceStatus( resource_type=resource_type, @@ -198,9 +191,14 @@ class ManageAvailabilityView(FormView): status='up', message='').save() - if parameters.run_check != run_check: - parameters.run_check = run_check - parameters.save() - resource.logger.info(u'availability checks %s', 'enabled' if run_check else 'disabled') + # log changes to notification delays + if 'notification_delays' in form.changed_data: + resource.logger.info(u'availability checks delays set to %s', form.instance.notification_delays) + + # log changes to run_check, if enabled immediately check for availability + if 'run_check' in form.changed_data: + resource.logger.info(u'availability checks %s', 'enabled' if form.instance.run_check else 'disabled') + if form.instance.run_check: + resource.availability() return super(ManageAvailabilityView, self).form_valid(form) diff --git a/tests/test_manager.py b/tests/test_manager.py index 71d93e31..3b674043 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -248,3 +248,18 @@ def test_availability_parameters(app, admin_user, monkeypatch): assert not csv.down() status = csv.get_availability_status() assert status.status == 'up' + + # Notification delays + resp = app.get(csv.get_absolute_url()) + resp = resp.click('availability check') + assert resp.form['notification_delays'].value == '0' + resp.form['notification_delays'] = '0,5,100' + resp = resp.form.submit().follow() + assert not resp.pyquery('.messages .warning') + assert csv.availability_parameters.notification_delays == '0,5,100' + + resp = app.get(csv.get_absolute_url()) + resp = resp.click('availability check') + resp.form['notification_delays'] = 'x' + resp = resp.form.submit() + assert len(resp.pyquery('#id_notification_delays_p .error')) diff --git a/tests/test_proxylogger.py b/tests/test_proxylogger.py index f67b672b..0771aa5b 100644 --- a/tests/test_proxylogger.py +++ b/tests/test_proxylogger.py @@ -1,9 +1,13 @@ # -*- coding: utf-8 -*- import logging +import datetime +import itertools import pytest -from httmock import all_requests, HTTMock +from httmock import HTTMock + +from django.core.exceptions import ValidationError from passerelle.base.models import ProxyLogger, ResourceLog from passerelle.apps.feeds.models import Feed @@ -66,20 +70,121 @@ def test_proxy_logger_ignore_when_down(db, connector): pr.debug(u'some message') assert len(ResourceLog.objects.all()) == 0 -def test_log_on_connector_availability_change(db, connector): + +def test_validate_notification_delays(db, connector): + def take(iterator, count): + return list(itertools.compress(iterator, range(1, count + 1))) + + availability_parameters = connector.availability_parameters + assert availability_parameters.notification_delays == '0' + + availability_parameters.full_clean() + assert availability_parameters.has_zero_delay() + assert list(availability_parameters.notification_delays_generator()) == [0] + + availability_parameters.notification_delays = '0,5,100,1000' + availability_parameters.full_clean() + assert availability_parameters.has_zero_delay() + assert take(availability_parameters.notification_delays_generator(), 6) == [0, 5, 100, 1000, 2000, 3000] + + availability_parameters.notification_delays = '5,100,1000' + availability_parameters.full_clean() + assert not availability_parameters.has_zero_delay() + assert take(availability_parameters.notification_delays_generator(), 6) == [5, 100, 1000, 2000, 3000, 4000] + + availability_parameters.notification_delays = '5' + availability_parameters.full_clean() + assert not availability_parameters.has_zero_delay() + assert take(availability_parameters.notification_delays_generator(), 3) == [5, 10, 15] + + # validation errors + availability_parameters.notification_delays = '5,100,100' + with pytest.raises(ValidationError): + availability_parameters.full_clean() + + availability_parameters.notification_delays = '' + with pytest.raises(ValidationError): + availability_parameters.full_clean() + + availability_parameters.notification_delays = '1,x' + with pytest.raises(ValidationError): + availability_parameters.full_clean() + + availability_parameters.notification_delays = '10,1' + with pytest.raises(ValidationError): + availability_parameters.full_clean() + + +@pytest.mark.parametrize('notification_delays', ['0', '0,5,100', '5,100']) +def test_log_on_connector_availability_change(db, connector, freezer, notification_delays): connector.title = u'éléphant' + availability_parameters = connector.availability_parameters + availability_parameters.notification_delays = notification_delays + availability_parameters.save() + with HTTMock(up_mock): # set connector as up connector.availability() + assert ResourceLog.objects.count() == 1 + assert ResourceLog.objects.latest('id').level == 'info' + assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)' ResourceLog.objects.all().delete() + + # move 5 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1)) + with HTTMock(down_mock): # set connector as down connector.availability() - assert len(ResourceLog.objects.all()) == 2 + assert connector.down() + last_count1 = ResourceLog.objects.count() + assert last_count1 == 2 assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)' - assert ResourceLog.objects.all()[1].level == 'error' - assert ResourceLog.objects.all()[1].message == u'connector "éléphant" (Feed) is now down' + assert ResourceLog.objects.all()[1].level == 'error' if notification_delays.startswith('0') else 'warning' + assert (u'connector "éléphant" (Feed) is now down: 404 Client' + in ResourceLog.objects.all()[1].message) + + # move 5 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1)) + + # second time log as error + with HTTMock(down_mock): # connector is still down + connector.availability() + assert connector.down() + last_count2 = ResourceLog.objects.count() + if notification_delays == '0': + assert last_count2 == last_count1 + else: + assert last_count2 == last_count1 + 1 + assert ResourceLog.objects.all()[2].level == 'error' if notification_delays != '0' else 'warning' + assert (u'connector "éléphant" (Feed) is still down for 5 minutes: 404' + in ResourceLog.objects.all()[2].message) + + # move 3 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1)) + + # third time no log + with HTTMock(down_mock): # connector is still down + connector.availability() + assert ResourceLog.objects.count() == last_count2 + + # move 3 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1)) + + with HTTMock(up_mock): # set connector as up + connector.availability() + assert not connector.down() + last_count3 = ResourceLog.objects.count() + assert last_count3 == last_count2 + 1 + assert ResourceLog.objects.latest('id').level == 'info' + assert ResourceLog.objects.latest('id').message == u'connector "éléphant" (Feed) is back up' + + # move 3 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1)) + # last message is a GET with HTTMock(up_mock): # set connector as up connector.availability() - assert len(ResourceLog.objects.all()) == 3 - assert ResourceLog.objects.all()[2].level == 'info' - assert ResourceLog.objects.all()[2].message == u'connector "éléphant" (Feed) is back up' + assert not connector.down() + last_count4 = ResourceLog.objects.count() + assert last_count4 == last_count3 + 1 + assert ResourceLog.objects.latest('id').level == 'info' + assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)' diff --git a/tox.ini b/tox.ini index 45fc2bac..53b9ee4a 100644 --- a/tox.ini +++ b/tox.ini @@ -35,6 +35,7 @@ deps = http://quixote.python.ca/releases/Quixote-2.7b2.tar.gz vobject django-ratelimit + pyquery commands = ./get_wcs.sh django111: py.test {posargs: {env:FAST:} --junitxml=test_{envname}_results.xml --cov-report xml --cov-report html --cov=passerelle/ --cov-config .coveragerc tests/} -- 2.23.0