Projet

Général

Profil

0002-log-errors-for-down-connectors-when-it-persists-3538.patch

Benjamin Dauvergne, 12 août 2019 18:46

Télécharger (7,97 ko)

Voir les différences:

Subject: [PATCH 2/2] log errors for down connectors when it persists (#35380)

 .../0014_resourcestatus_counts_checked.py     | 20 ++++++++++
 passerelle/base/models.py                     | 38 ++++++++++++++++---
 tests/test_proxylogger.py                     | 30 ++++++++++++---
 3 files changed, 76 insertions(+), 12 deletions(-)
 create mode 100644 passerelle/base/migrations/0014_resourcestatus_counts_checked.py
passerelle/base/migrations/0014_resourcestatus_counts_checked.py
1
# -*- coding: utf-8 -*-
2
# Generated by Django 1.11.20 on 2019-08-12 16:17
3
from __future__ import unicode_literals
4

  
5
from django.db import migrations, models
6

  
7

  
8
class Migration(migrations.Migration):
9

  
10
    dependencies = [
11
        ('base', '0013_delete_templatevar'),
12
    ]
13

  
14
    operations = [
15
        migrations.AddField(
16
            model_name='resourcestatus',
17
            name='counts_checked',
18
            field=models.PositiveIntegerField(default=1),
19
        ),
20
    ]
passerelle/base/models.py
15 15
from django.core.exceptions import ValidationError, ObjectDoesNotExist, PermissionDenied
16 16
from django.core.urlresolvers import reverse
17 17
from django.db import connection, models, transaction
18
from django.db.models import Q
18
from django.db.models import Q, F
19 19
from django.test import override_settings
20 20
from django.utils.text import slugify
21 21
from django.utils import timezone, six
22 22
from django.utils.translation import ugettext_lazy as _
23
from django.utils.encoding import force_text
24
from django.utils.timezone import now
23 25
from django.core.files.base import ContentFile
24 26

  
25 27
from django.contrib.contenttypes.models import ContentType
......
409 411
        except NotImplementedError:
410 412
            return
411 413
        except Exception as e:
414
            from passerelle.utils.conversion import exception_to_text
415

  
412 416
            status = 'down'
413
            message = repr(e)[:500]
417
            message = exception_to_text(e)[:500]
414 418

  
415 419
        resource_type = ContentType.objects.get_for_model(self)
416 420
        current_status = ResourceStatus.objects.filter(
......
418 422
                resource_pk=self.pk).first()
419 423
        if not current_status or status != current_status.status:
420 424
            if status == 'down' and not currently_down:
421
                self.logger.error(u'connector "%s" (%s) is now down', self, self.__class__.__name__)
425
                self.logger.warning(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message)
422 426
            ResourceStatus(
423 427
                resource_type=resource_type,
424 428
                resource_pk=self.pk,
......
427 431
            if status == 'up' and currently_down:
428 432
                self.logger.info(u'connector "%s" (%s) is back up', self, self.__class__.__name__)
429 433
        elif status == 'down':
430
            current_status.message = message
431
            current_status.save()
434
            # availability cron is run every 5 minutes
435
            # so we will get a mail 5, 50, 500, 1500, 3000 then every 5000
436
            # minutes after the first down event
437
            if (current_status.counts_checked in (1, 10, 100, 300, 600, 1000)
438
                    or current_status.counts_checked % 1000 == 0):
439
                duration = now() - current_status.start_timestamp
440
                days = duration.total_seconds() // 86400
441
                hours = duration.total_seconds() // 3600
442
                minutes = duration.total_seconds() // 60
443
                if days > 1:
444
                    human_duration = 'for %d days' % days
445
                elif hours > 1:
446
                    human_duration = 'for %d hours' % hours
447
                else:
448
                    human_duration = 'for %d minutes' % minutes
449
                self.logger.error(u'connector "%s" (%s) is still down %s: %s',
450
                                  self, self.__class__.__name__,
451
                                  human_duration,
452
                                  message,
453
                                  # when connector is down, logging is shutdown
454
                                  force=True)
455
            ResourceStatus.objects.filter(pk=current_status.pk).update(
456
                message=message, counts_checked=F('counts_checked') + 1)
432 457

  
433 458
    def hourly(self):
434 459
        pass
......
609 634
    start_timestamp = models.DateTimeField(auto_now_add=True)
610 635
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='unknown')
611 636
    message = models.CharField(max_length=500, blank=True)
637
    counts_checked = models.PositiveIntegerField(default=1)
612 638

  
613 639
    class Meta:
614 640
        ordering = ['-start_timestamp']
......
635 661
        return self._logger.getEffectiveLevel()
636 662

  
637 663
    def _log(self, levelname, message, *args, **kwargs):
638
        if self.connector.down():
664
        if self.connector.down() and not kwargs.pop('force', False):
639 665
            # don't log if the connector is known to be down
640 666
            return
641 667
        levelno = getattr(logging, levelname)
tests/test_proxylogger.py
1 1
# -*- coding: utf-8 -*-
2 2

  
3 3
import logging
4
import datetime
4 5

  
5 6
import pytest
6 7
from httmock import all_requests, HTTMock
......
66 67
    pr.debug(u'some message')
67 68
    assert len(ResourceLog.objects.all()) == 0
68 69

  
69
def test_log_on_connector_availability_change(db, connector):
70
def test_log_on_connector_availability_change(db, connector, freezer):
70 71
    connector.title = u'éléphant'
71 72
    with HTTMock(up_mock):  # set connector as up
72 73
        connector.availability()
......
75 76
        connector.availability()
76 77
    assert len(ResourceLog.objects.all()) == 2
77 78
    assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)'
78
    assert ResourceLog.objects.all()[1].level == 'error'
79
    assert ResourceLog.objects.all()[1].message == u'connector "éléphant" (Feed) is now down'
79
    assert ResourceLog.objects.all()[1].level == 'warning'
80
    assert (u'connector "éléphant" (Feed) is now down: 404 Client'
81
            in ResourceLog.objects.all()[1].message)
80 82

  
81
    with HTTMock(up_mock):  # set connector as up
83
    # move 5 minutes in the future
84
    freezer.move_to(datetime.timedelta(seconds=60 * 5))
85

  
86
    # second time log as error
87
    with HTTMock(down_mock):  # connector is still down
88
        connector.availability()
89
    assert len(ResourceLog.objects.all()) == 3
90
    assert ResourceLog.objects.all()[2].level == 'error'
91
    assert (u'connector "éléphant" (Feed) is still down for 5 minutes: 404'
92
            in ResourceLog.objects.all()[2].message)
93

  
94
    # third time no log
95
    with HTTMock(down_mock):  # connector is still down
82 96
        connector.availability()
83 97
    assert len(ResourceLog.objects.all()) == 3
84
    assert ResourceLog.objects.all()[2].level == 'info'
85
    assert ResourceLog.objects.all()[2].message == u'connector "éléphant" (Feed) is back up'
98

  
99
    with HTTMock(up_mock):  # set connector as up
100
        connector.availability()
101
    assert len(ResourceLog.objects.all()) == 4
102
    assert ResourceLog.objects.all()[3].level == 'info'
103
    assert ResourceLog.objects.all()[3].message == u'connector "éléphant" (Feed) is back up'
86
-