From bfcb5d6f3ec68486ee1bb511f9ef2ff468be86d2 Mon Sep 17 00:00:00 2001 From: Benjamin Dauvergne Date: Mon, 12 Aug 2019 18:17:11 +0200 Subject: [PATCH 2/2] log errors for down connectors when it persists (#35380) --- .../0014_resourcestatus_counts_checked.py | 20 ++++++++++ passerelle/base/models.py | 38 ++++++++++++++++--- tests/test_proxylogger.py | 34 ++++++++++++++--- 3 files changed, 80 insertions(+), 12 deletions(-) create mode 100644 passerelle/base/migrations/0014_resourcestatus_counts_checked.py diff --git a/passerelle/base/migrations/0014_resourcestatus_counts_checked.py b/passerelle/base/migrations/0014_resourcestatus_counts_checked.py new file mode 100644 index 00000000..5ed7c2ce --- /dev/null +++ b/passerelle/base/migrations/0014_resourcestatus_counts_checked.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.20 on 2019-08-12 16:17 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('base', '0013_delete_templatevar'), + ] + + operations = [ + migrations.AddField( + model_name='resourcestatus', + name='counts_checked', + field=models.PositiveIntegerField(default=1), + ), + ] diff --git a/passerelle/base/models.py b/passerelle/base/models.py index a4630010..9073189d 100644 --- a/passerelle/base/models.py +++ b/passerelle/base/models.py @@ -15,11 +15,13 @@ from django.conf import settings from django.core.exceptions import ValidationError, ObjectDoesNotExist, PermissionDenied from django.core.urlresolvers import reverse from django.db import connection, models, transaction -from django.db.models import Q +from django.db.models import Q, F from django.test import override_settings from django.utils.text import slugify from django.utils import timezone, six from django.utils.translation import ugettext_lazy as _ +from django.utils.encoding import force_text +from django.utils.timezone import now from django.core.files.base import ContentFile from django.contrib.contenttypes.models import ContentType @@ -409,8 +411,10 @@ class BaseResource(models.Model): except NotImplementedError: return except Exception as e: + from passerelle.utils.conversion import exception_to_text + status = 'down' - message = repr(e)[:500] + message = exception_to_text(e)[:500] resource_type = ContentType.objects.get_for_model(self) current_status = ResourceStatus.objects.filter( @@ -418,7 +422,7 @@ class BaseResource(models.Model): resource_pk=self.pk).first() if not current_status or status != current_status.status: if status == 'down' and not currently_down: - self.logger.error(u'connector "%s" (%s) is now down', self, self.__class__.__name__) + self.logger.warning(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message) ResourceStatus( resource_type=resource_type, resource_pk=self.pk, @@ -427,8 +431,29 @@ class BaseResource(models.Model): if status == 'up' and currently_down: self.logger.info(u'connector "%s" (%s) is back up', self, self.__class__.__name__) elif status == 'down': - current_status.message = message - current_status.save() + # availability cron is run every 5 minutes + # so we will get a mail 5, 50, 500, 1500, 3000 then every 5000 + # minutes after the first down event + if (current_status.counts_checked in (1, 10, 100, 300, 600, 1000) + or current_status.counts_checked % 1000 == 0): + duration = now() - current_status.start_timestamp + days = duration.total_seconds() // 86400 + hours = duration.total_seconds() // 3600 + minutes = duration.total_seconds() // 60 + if days > 1: + human_duration = 'for %d days' % days + elif hours > 1: + human_duration = 'for %d hours' % hours + else: + human_duration = 'for %d minutes' % minutes + self.logger.error(u'connector "%s" (%s) is still down %s: %s', + self, self.__class__.__name__, + human_duration, + message, + # when connector is down, logging is shutdown + force=True) + ResourceStatus.objects.filter(pk=current_status.pk).update( + message=message, counts_checked=F('counts_checked') + 1) def hourly(self): pass @@ -609,6 +634,7 @@ class ResourceStatus(models.Model): start_timestamp = models.DateTimeField(auto_now_add=True) status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='unknown') message = models.CharField(max_length=500, blank=True) + counts_checked = models.PositiveIntegerField(default=1) class Meta: ordering = ['-start_timestamp'] @@ -635,7 +661,7 @@ class ProxyLogger(object): return self._logger.getEffectiveLevel() def _log(self, levelname, message, *args, **kwargs): - if self.connector.down(): + if self.connector.down() and not kwargs.pop('force', False): # don't log if the connector is known to be down return levelno = getattr(logging, levelname) diff --git a/tests/test_proxylogger.py b/tests/test_proxylogger.py index f67b672b..0aa4450e 100644 --- a/tests/test_proxylogger.py +++ b/tests/test_proxylogger.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import logging +import datetime import pytest from httmock import all_requests, HTTMock @@ -66,20 +67,41 @@ def test_proxy_logger_ignore_when_down(db, connector): pr.debug(u'some message') assert len(ResourceLog.objects.all()) == 0 -def test_log_on_connector_availability_change(db, connector): +def test_log_on_connector_availability_change(db, connector, freezer): connector.title = u'éléphant' with HTTMock(up_mock): # set connector as up connector.availability() ResourceLog.objects.all().delete() + + # move 5 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 5)) + with HTTMock(down_mock): # set connector as down connector.availability() assert len(ResourceLog.objects.all()) == 2 assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)' - assert ResourceLog.objects.all()[1].level == 'error' - assert ResourceLog.objects.all()[1].message == u'connector "éléphant" (Feed) is now down' + assert ResourceLog.objects.all()[1].level == 'warning' + assert (u'connector "éléphant" (Feed) is now down: 404 Client' + in ResourceLog.objects.all()[1].message) - with HTTMock(up_mock): # set connector as up + # move 5 minutes in the future + freezer.move_to(datetime.timedelta(seconds=60 * 5)) + + # second time log as error + with HTTMock(down_mock): # connector is still down + connector.availability() + assert len(ResourceLog.objects.all()) == 3 + assert ResourceLog.objects.all()[2].level == 'error' + assert (u'connector "éléphant" (Feed) is still down for 5 minutes: 404' + in ResourceLog.objects.all()[2].message) + + # third time no log + with HTTMock(down_mock): # connector is still down connector.availability() assert len(ResourceLog.objects.all()) == 3 - assert ResourceLog.objects.all()[2].level == 'info' - assert ResourceLog.objects.all()[2].message == u'connector "éléphant" (Feed) is back up' + + with HTTMock(up_mock): # set connector as up + connector.availability() + assert len(ResourceLog.objects.all()) == 4 + assert ResourceLog.objects.all()[3].level == 'info' + assert ResourceLog.objects.all()[3].message == u'connector "éléphant" (Feed) is back up' -- 2.23.0.rc1