Projet

Général

Profil

0002-log-errors-for-down-connectors-when-it-persists-3538.patch

Benjamin Dauvergne, 20 août 2019 17:17

Télécharger (14,9 ko)

Voir les différences:

Subject: [PATCH 2/2] log errors for down connectors when it persists (#35380)

 .../migrations/0014_auto_20190820_0914.py     |  26 ++++
 passerelle/base/models.py                     |  95 ++++++++++++--
 tests/test_proxylogger.py                     | 121 ++++++++++++++++--
 3 files changed, 224 insertions(+), 18 deletions(-)
 create mode 100644 passerelle/base/migrations/0014_auto_20190820_0914.py
passerelle/base/migrations/0014_auto_20190820_0914.py
1
# -*- coding: utf-8 -*-
2
# Generated by Django 1.11.20 on 2019-08-20 14:14
3
from __future__ import unicode_literals
4

  
5
from django.db import migrations, models
6
import passerelle.base.models
7

  
8

  
9
class Migration(migrations.Migration):
10

  
11
    dependencies = [
12
        ('base', '0013_delete_templatevar'),
13
    ]
14

  
15
    operations = [
16
        migrations.AddField(
17
            model_name='availabilityparameters',
18
            name='notification_delays',
19
            field=models.TextField(default=b'0', validators=[passerelle.base.models.validate_notification_delays]),
20
        ),
21
        migrations.AddField(
22
            model_name='resourcestatus',
23
            name='last_notification_timestamp',
24
            field=models.DateTimeField(blank=True, null=True),
25
        ),
26
    ]
passerelle/base/models.py
8 8
import sys
9 9
import traceback
10 10
import base64
11
import itertools
11 12

  
12 13
import django
13 14
from django.apps import apps
......
20 21
from django.utils.text import slugify
21 22
from django.utils import timezone, six
22 23
from django.utils.translation import ugettext_lazy as _
24
from django.utils.timezone import now
23 25
from django.core.files.base import ContentFile
24 26

  
25 27
from django.contrib.contenttypes.models import ContentType
......
401 403
        if not self.availability_parameters.run_check:
402 404
            return
403 405

  
404
        currently_down = self.down()
406
        availability_parameters = self.availability_parameters
407

  
405 408
        try:
406 409
            self.check_status()
407 410
            status = 'up'
......
409 412
        except NotImplementedError:
410 413
            return
411 414
        except Exception as e:
415
            from passerelle.utils.conversion import exception_to_text
416

  
412 417
            status = 'down'
413
            message = repr(e)[:500]
418
            message = exception_to_text(e)[:500]
414 419

  
415 420
        resource_type = ContentType.objects.get_for_model(self)
416 421
        current_status = ResourceStatus.objects.filter(
417
                resource_type=resource_type,
418
                resource_pk=self.pk).first()
422
            resource_type=resource_type,
423
            resource_pk=self.pk).first()
419 424
        if not current_status or status != current_status.status:
420
            if status == 'down' and not currently_down:
421
                self.logger.error(u'connector "%s" (%s) is now down', self, self.__class__.__name__)
425
            if status == 'down' and not self.down():  # new downtime
426
                if availability_parameters.has_zero_delay():
427
                    self.logger.error(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message)
428
                else:
429
                    self.logger.warning(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message)
422 430
            ResourceStatus(
423 431
                resource_type=resource_type,
424 432
                resource_pk=self.pk,
425 433
                status=status,
426 434
                message=message).save()
427
            if status == 'up' and currently_down:
435
            if status == 'up' and current_status:
428 436
                self.logger.info(u'connector "%s" (%s) is back up', self, self.__class__.__name__)
429 437
        elif status == 'down':
430
            current_status.message = message
431
            current_status.save()
438
            # check last_notification_downtime and current downtime to see if it matches a new notification delay
439
            last_notification_timestamp = current_status.last_notification_timestamp or current_status.start_timestamp
440

  
441
            current_time = now()
442
            downtime = (current_time - current_status.start_timestamp).total_seconds() // 60
443
            last_notification_downtime = (
444
                last_notification_timestamp - current_status.start_timestamp).total_seconds() // 60
445

  
446
            for delay in availability_parameters.notification_delays_generator():
447
                if not delay:
448
                    continue
449
                if downtime >= delay:
450
                    if last_notification_downtime < delay:
451
                        days = downtime // 1440
452
                        hours = downtime // 60
453
                        if days > 1:
454
                            human_duration = 'for %d days' % days
455
                        elif hours > 1:
456
                            human_duration = 'for %d hours' % hours
457
                        else:
458
                            human_duration = 'for %d minutes' % downtime
459
                        self.logger.error(u'connector "%s" (%s) is still down %s: %s',
460
                                          self, self.__class__.__name__,
461
                                          human_duration,
462
                                          message,
463
                                          # when connector is down, logging is shutdown
464
                                          force=True)
465
                        ResourceStatus.objects.filter(pk=current_status.pk).update(
466
                            message=message, last_notification_timestamp=current_time)
467
                        break
468
                else:
469
                    break
432 470

  
433 471
    def hourly(self):
434 472
        pass
......
537 575
        unique_together = (('resource_type', 'resource_pk'))
538 576

  
539 577

  
578
def parse_notification_delays(value):
579
    delays = [int(v.strip()) for v in value.split(',')]
580
    if not all(delay >= 0 for delay in delays):
581
        raise ValueError
582
    if delays != sorted(delays):
583
        raise ValueError
584
    if len(set(delays)) != len(delays):
585
        raise ValueError
586
    return delays
587

  
588

  
589
def validate_notification_delays(value):
590
    value = value or u''
591
    try:
592
        parse_notification_delays(value)
593
    except ValueError:
594
        raise ValidationError(_('You must provide a list of increasing integers'))
595

  
596

  
540 597
class AvailabilityParameters(models.Model):
541 598
    resource_type = models.ForeignKey(ContentType)
542 599
    resource_pk = models.PositiveIntegerField()
......
544 601
    run_check = models.BooleanField(
545 602
        default=True, verbose_name=_('Run regular availability checks'),
546 603
        help_text=_('Run an availability check every 5 minutes'))
604
    notification_delays = models.TextField(
605
        default='0',
606
        blank=False,
607
        validators=[validate_notification_delays])
608

  
609
    def has_zero_delay(self):
610
        return 0 in parse_notification_delays(self.notification_delays)
611

  
612
    def notification_delays_generator(self):
613
        notification_delays = parse_notification_delays(self.notification_delays)
614
        last_notification_delay = notification_delays[-1]
615
        if last_notification_delay > 1:
616
            notification_delays = itertools.chain(
617
                notification_delays,
618
                itertools.count(2 * last_notification_delay, last_notification_delay))
619
        return notification_delays
547 620

  
548 621
    class Meta:
549 622
        unique_together = (('resource_type', 'resource_pk'))
......
609 682
    start_timestamp = models.DateTimeField(auto_now_add=True)
610 683
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='unknown')
611 684
    message = models.CharField(max_length=500, blank=True)
685
    last_notification_timestamp = models.DateTimeField(blank=True, null=True)
612 686

  
613 687
    class Meta:
614 688
        ordering = ['-start_timestamp']
......
635 709
        return self._logger.getEffectiveLevel()
636 710

  
637 711
    def _log(self, levelname, message, *args, **kwargs):
638
        if self.connector.down():
712
        force = kwargs.pop('force', False)
713
        if self.connector.down() and not force:
639 714
            # don't log if the connector is known to be down
640 715
            return
641 716
        levelno = getattr(logging, levelname)
tests/test_proxylogger.py
1 1
# -*- coding: utf-8 -*-
2 2

  
3 3
import logging
4
import datetime
5
import itertools
4 6

  
5 7
import pytest
6 8
from httmock import all_requests, HTTMock
7 9

  
8
from passerelle.base.models import ProxyLogger, ResourceLog
10
from django.core.exceptions import ValidationError
11

  
12
from passerelle.base.models import ProxyLogger, ResourceLog, AvailabilityParameters
9 13
from passerelle.apps.feeds.models import Feed
10 14

  
11 15
from .test_availability import down_mock, up_mock
......
66 70
    pr.debug(u'some message')
67 71
    assert len(ResourceLog.objects.all()) == 0
68 72

  
69
def test_log_on_connector_availability_change(db, connector):
73

  
74
def test_validate_notification_delays(db, connector):
75
    def take(iterator, count):
76
        return list(itertools.compress(iterator, range(1, count + 1)))
77

  
78
    availability_parameters = connector.availability_parameters
79
    assert availability_parameters.notification_delays == '0'
80

  
81
    availability_parameters.full_clean()
82
    assert availability_parameters.has_zero_delay()
83
    assert list(availability_parameters.notification_delays_generator()) == [0]
84

  
85
    availability_parameters.notification_delays = '0,5,100,1000'
86
    availability_parameters.full_clean()
87
    assert availability_parameters.has_zero_delay()
88
    assert take(availability_parameters.notification_delays_generator(), 6) == [0, 5, 100, 1000, 2000, 3000]
89

  
90
    availability_parameters.notification_delays = '5,100,1000'
91
    availability_parameters.full_clean()
92
    assert not availability_parameters.has_zero_delay()
93
    assert take(availability_parameters.notification_delays_generator(), 6) == [5, 100, 1000, 2000, 3000, 4000]
94

  
95
    availability_parameters.notification_delays = '5'
96
    availability_parameters.full_clean()
97
    assert not availability_parameters.has_zero_delay()
98
    assert take(availability_parameters.notification_delays_generator(), 3) == [5, 10, 15]
99

  
100
    # validation errors
101
    availability_parameters.notification_delays = '5,100,100'
102
    with pytest.raises(ValidationError):
103
        availability_parameters.full_clean()
104

  
105
    availability_parameters.notification_delays = ''
106
    with pytest.raises(ValidationError):
107
        availability_parameters.full_clean()
108

  
109
    availability_parameters.notification_delays = '1,x'
110
    with pytest.raises(ValidationError):
111
        availability_parameters.full_clean()
112

  
113
    availability_parameters.notification_delays = '10,1'
114
    with pytest.raises(ValidationError):
115
        availability_parameters.full_clean()
116

  
117

  
118
@pytest.mark.parametrize('notification_delays', ['0', '0,5,100', '5,100'])
119
def test_log_on_connector_availability_change(db, connector, freezer, notification_delays):
70 120
    connector.title = u'éléphant'
121
    availability_parameters = connector.availability_parameters
122
    availability_parameters.notification_delays = notification_delays
123
    availability_parameters.save()
124

  
71 125
    with HTTMock(up_mock):  # set connector as up
72 126
        connector.availability()
127
    assert ResourceLog.objects.count() == 1
128
    assert ResourceLog.objects.latest('id').level == 'info'
129
    assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)'
73 130
    ResourceLog.objects.all().delete()
131

  
132
    # move 5 minutes in the future
133
    freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1))
134

  
74 135
    with HTTMock(down_mock):  # set connector as down
75 136
        connector.availability()
76
    assert len(ResourceLog.objects.all()) == 2
137
    assert connector.down()
138
    last_count1 = ResourceLog.objects.count()
139
    assert last_count1 == 2
77 140
    assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)'
78
    assert ResourceLog.objects.all()[1].level == 'error'
79
    assert ResourceLog.objects.all()[1].message == u'connector "éléphant" (Feed) is now down'
141
    assert ResourceLog.objects.all()[1].level == 'error' if notification_delays.startswith('0') else 'warning'
142
    assert (u'connector "éléphant" (Feed) is now down: 404 Client'
143
            in ResourceLog.objects.all()[1].message)
144

  
145
    # move 5 minutes in the future
146
    freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1))
147

  
148
    # second time log as error
149
    with HTTMock(down_mock):  # connector is still down
150
        connector.availability()
151
    assert connector.down()
152
    last_count2 = ResourceLog.objects.count()
153
    if notification_delays == '0':
154
        assert last_count2 == last_count1
155
    else:
156
        assert last_count2 == last_count1 + 1
157
        assert ResourceLog.objects.all()[2].level == 'error' if notification_delays != '0' else 'warning'
158
        assert (u'connector "éléphant" (Feed) is still down for 5 minutes: 404'
159
                in ResourceLog.objects.all()[2].message)
160

  
161
    # move 3 minutes in the future
162
    freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
163

  
164
    # third time no log
165
    with HTTMock(down_mock):  # connector is still down
166
        connector.availability()
167
    assert ResourceLog.objects.count() == last_count2
168

  
169
    # move 3 minutes in the future
170
    freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
171

  
172
    with HTTMock(up_mock):  # set connector as up
173
        connector.availability()
174
    assert not connector.down()
175
    last_count3 = ResourceLog.objects.count()
176
    assert last_count3 == last_count2 + 1
177
    assert ResourceLog.objects.latest('id').level == 'info'
178
    assert ResourceLog.objects.latest('id').message == u'connector "éléphant" (Feed) is back up'
179

  
180
    # move 3 minutes in the future
181
    freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
80 182

  
183
    # last message is a GET
81 184
    with HTTMock(up_mock):  # set connector as up
82 185
        connector.availability()
83
    assert len(ResourceLog.objects.all()) == 3
84
    assert ResourceLog.objects.all()[2].level == 'info'
85
    assert ResourceLog.objects.all()[2].message == u'connector "éléphant" (Feed) is back up'
186
    assert not connector.down()
187
    last_count4 = ResourceLog.objects.count()
188
    assert last_count4 == last_count3 + 1
189
    assert ResourceLog.objects.latest('id').level == 'info'
190
    assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)'
86
-