0002-log-errors-for-down-connectors-when-it-persists-3538.patch
passerelle/base/migrations/0014_auto_20190820_0914.py | ||
---|---|---|
1 |
# -*- coding: utf-8 -*- |
|
2 |
# Generated by Django 1.11.20 on 2019-08-20 14:14 |
|
3 |
from __future__ import unicode_literals |
|
4 | ||
5 |
from django.db import migrations, models |
|
6 |
import passerelle.base.models |
|
7 | ||
8 | ||
9 |
class Migration(migrations.Migration): |
|
10 | ||
11 |
dependencies = [ |
|
12 |
('base', '0013_delete_templatevar'), |
|
13 |
] |
|
14 | ||
15 |
operations = [ |
|
16 |
migrations.AddField( |
|
17 |
model_name='availabilityparameters', |
|
18 |
name='notification_delays', |
|
19 |
field=models.TextField(default=b'0', validators=[passerelle.base.models.validate_notification_delays]), |
|
20 |
), |
|
21 |
migrations.AddField( |
|
22 |
model_name='resourcestatus', |
|
23 |
name='last_notification_timestamp', |
|
24 |
field=models.DateTimeField(blank=True, null=True), |
|
25 |
), |
|
26 |
] |
passerelle/base/models.py | ||
---|---|---|
8 | 8 |
import sys |
9 | 9 |
import traceback |
10 | 10 |
import base64 |
11 |
import itertools |
|
11 | 12 | |
12 | 13 |
import django |
13 | 14 |
from django.apps import apps |
... | ... | |
20 | 21 |
from django.utils.text import slugify |
21 | 22 |
from django.utils import timezone, six |
22 | 23 |
from django.utils.translation import ugettext_lazy as _ |
24 |
from django.utils.timezone import now |
|
23 | 25 |
from django.core.files.base import ContentFile |
24 | 26 | |
25 | 27 |
from django.contrib.contenttypes.models import ContentType |
... | ... | |
401 | 403 |
if not self.availability_parameters.run_check: |
402 | 404 |
return |
403 | 405 | |
404 |
currently_down = self.down() |
|
406 |
availability_parameters = self.availability_parameters |
|
407 | ||
405 | 408 |
try: |
406 | 409 |
self.check_status() |
407 | 410 |
status = 'up' |
... | ... | |
409 | 412 |
except NotImplementedError: |
410 | 413 |
return |
411 | 414 |
except Exception as e: |
415 |
from passerelle.utils.conversion import exception_to_text |
|
416 | ||
412 | 417 |
status = 'down' |
413 |
message = repr(e)[:500]
|
|
418 |
message = exception_to_text(e)[:500]
|
|
414 | 419 | |
415 | 420 |
resource_type = ContentType.objects.get_for_model(self) |
416 | 421 |
current_status = ResourceStatus.objects.filter( |
417 |
resource_type=resource_type,
|
|
418 |
resource_pk=self.pk).first()
|
|
422 |
resource_type=resource_type, |
|
423 |
resource_pk=self.pk).first() |
|
419 | 424 |
if not current_status or status != current_status.status: |
420 |
if status == 'down' and not currently_down: |
|
421 |
self.logger.error(u'connector "%s" (%s) is now down', self, self.__class__.__name__) |
|
425 |
if status == 'down' and not self.down(): # new downtime |
|
426 |
if availability_parameters.has_zero_delay(): |
|
427 |
self.logger.error(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message) |
|
428 |
else: |
|
429 |
self.logger.warning(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message) |
|
422 | 430 |
ResourceStatus( |
423 | 431 |
resource_type=resource_type, |
424 | 432 |
resource_pk=self.pk, |
425 | 433 |
status=status, |
426 | 434 |
message=message).save() |
427 |
if status == 'up' and currently_down:
|
|
435 |
if status == 'up' and current_status:
|
|
428 | 436 |
self.logger.info(u'connector "%s" (%s) is back up', self, self.__class__.__name__) |
429 | 437 |
elif status == 'down': |
430 |
current_status.message = message |
|
431 |
current_status.save() |
|
438 |
# check last_notification_downtime and current downtime to see if it matches a new notification delay |
|
439 |
last_notification_timestamp = current_status.last_notification_timestamp or current_status.start_timestamp |
|
440 | ||
441 |
current_time = now() |
|
442 |
downtime = (current_time - current_status.start_timestamp).total_seconds() // 60 |
|
443 |
last_notification_downtime = ( |
|
444 |
last_notification_timestamp - current_status.start_timestamp).total_seconds() // 60 |
|
445 | ||
446 |
for delay in availability_parameters.notification_delays_generator(): |
|
447 |
if not delay: |
|
448 |
continue |
|
449 |
if downtime >= delay: |
|
450 |
if last_notification_downtime < delay: |
|
451 |
days = downtime // 1440 |
|
452 |
hours = downtime // 60 |
|
453 |
if days > 1: |
|
454 |
human_duration = 'for %d days' % days |
|
455 |
elif hours > 1: |
|
456 |
human_duration = 'for %d hours' % hours |
|
457 |
else: |
|
458 |
human_duration = 'for %d minutes' % downtime |
|
459 |
self.logger.error(u'connector "%s" (%s) is still down %s: %s', |
|
460 |
self, self.__class__.__name__, |
|
461 |
human_duration, |
|
462 |
message, |
|
463 |
# when connector is down, logging is shutdown |
|
464 |
force=True) |
|
465 |
ResourceStatus.objects.filter(pk=current_status.pk).update( |
|
466 |
message=message, last_notification_timestamp=current_time) |
|
467 |
break |
|
468 |
else: |
|
469 |
break |
|
432 | 470 | |
433 | 471 |
def hourly(self): |
434 | 472 |
pass |
... | ... | |
537 | 575 |
unique_together = (('resource_type', 'resource_pk')) |
538 | 576 | |
539 | 577 | |
578 |
def parse_notification_delays(value): |
|
579 |
delays = [int(v.strip()) for v in value.split(',')] |
|
580 |
if not all(delay >= 0 for delay in delays): |
|
581 |
raise ValueError |
|
582 |
if delays != sorted(delays): |
|
583 |
raise ValueError |
|
584 |
if len(set(delays)) != len(delays): |
|
585 |
raise ValueError |
|
586 |
return delays |
|
587 | ||
588 | ||
589 |
def validate_notification_delays(value): |
|
590 |
value = value or u'' |
|
591 |
try: |
|
592 |
parse_notification_delays(value) |
|
593 |
except ValueError: |
|
594 |
raise ValidationError(_('You must provide a list of increasing integers')) |
|
595 | ||
596 | ||
540 | 597 |
class AvailabilityParameters(models.Model): |
541 | 598 |
resource_type = models.ForeignKey(ContentType) |
542 | 599 |
resource_pk = models.PositiveIntegerField() |
... | ... | |
544 | 601 |
run_check = models.BooleanField( |
545 | 602 |
default=True, verbose_name=_('Run regular availability checks'), |
546 | 603 |
help_text=_('Run an availability check every 5 minutes')) |
604 |
notification_delays = models.TextField( |
|
605 |
default='0', |
|
606 |
blank=False, |
|
607 |
validators=[validate_notification_delays]) |
|
608 | ||
609 |
def has_zero_delay(self): |
|
610 |
return 0 in parse_notification_delays(self.notification_delays) |
|
611 | ||
612 |
def notification_delays_generator(self): |
|
613 |
notification_delays = parse_notification_delays(self.notification_delays) |
|
614 |
last_notification_delay = notification_delays[-1] |
|
615 |
if last_notification_delay > 1: |
|
616 |
notification_delays = itertools.chain( |
|
617 |
notification_delays, |
|
618 |
itertools.count(2 * last_notification_delay, last_notification_delay)) |
|
619 |
return notification_delays |
|
547 | 620 | |
548 | 621 |
class Meta: |
549 | 622 |
unique_together = (('resource_type', 'resource_pk')) |
... | ... | |
609 | 682 |
start_timestamp = models.DateTimeField(auto_now_add=True) |
610 | 683 |
status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='unknown') |
611 | 684 |
message = models.CharField(max_length=500, blank=True) |
685 |
last_notification_timestamp = models.DateTimeField(blank=True, null=True) |
|
612 | 686 | |
613 | 687 |
class Meta: |
614 | 688 |
ordering = ['-start_timestamp'] |
... | ... | |
635 | 709 |
return self._logger.getEffectiveLevel() |
636 | 710 | |
637 | 711 |
def _log(self, levelname, message, *args, **kwargs): |
638 |
if self.connector.down(): |
|
712 |
force = kwargs.pop('force', False) |
|
713 |
if self.connector.down() and not force: |
|
639 | 714 |
# don't log if the connector is known to be down |
640 | 715 |
return |
641 | 716 |
levelno = getattr(logging, levelname) |
tests/test_proxylogger.py | ||
---|---|---|
1 | 1 |
# -*- coding: utf-8 -*- |
2 | 2 | |
3 | 3 |
import logging |
4 |
import datetime |
|
5 |
import itertools |
|
4 | 6 | |
5 | 7 |
import pytest |
6 | 8 |
from httmock import all_requests, HTTMock |
7 | 9 | |
8 |
from passerelle.base.models import ProxyLogger, ResourceLog |
|
10 |
from django.core.exceptions import ValidationError |
|
11 | ||
12 |
from passerelle.base.models import ProxyLogger, ResourceLog, AvailabilityParameters |
|
9 | 13 |
from passerelle.apps.feeds.models import Feed |
10 | 14 | |
11 | 15 |
from .test_availability import down_mock, up_mock |
... | ... | |
66 | 70 |
pr.debug(u'some message') |
67 | 71 |
assert len(ResourceLog.objects.all()) == 0 |
68 | 72 | |
69 |
def test_log_on_connector_availability_change(db, connector): |
|
73 | ||
74 |
def test_validate_notification_delays(db, connector): |
|
75 |
def take(iterator, count): |
|
76 |
return list(itertools.compress(iterator, range(1, count + 1))) |
|
77 | ||
78 |
availability_parameters = connector.availability_parameters |
|
79 |
assert availability_parameters.notification_delays == '0' |
|
80 | ||
81 |
availability_parameters.full_clean() |
|
82 |
assert availability_parameters.has_zero_delay() |
|
83 |
assert list(availability_parameters.notification_delays_generator()) == [0] |
|
84 | ||
85 |
availability_parameters.notification_delays = '0,5,100,1000' |
|
86 |
availability_parameters.full_clean() |
|
87 |
assert availability_parameters.has_zero_delay() |
|
88 |
assert take(availability_parameters.notification_delays_generator(), 6) == [0, 5, 100, 1000, 2000, 3000] |
|
89 | ||
90 |
availability_parameters.notification_delays = '5,100,1000' |
|
91 |
availability_parameters.full_clean() |
|
92 |
assert not availability_parameters.has_zero_delay() |
|
93 |
assert take(availability_parameters.notification_delays_generator(), 6) == [5, 100, 1000, 2000, 3000, 4000] |
|
94 | ||
95 |
availability_parameters.notification_delays = '5' |
|
96 |
availability_parameters.full_clean() |
|
97 |
assert not availability_parameters.has_zero_delay() |
|
98 |
assert take(availability_parameters.notification_delays_generator(), 3) == [5, 10, 15] |
|
99 | ||
100 |
# validation errors |
|
101 |
availability_parameters.notification_delays = '5,100,100' |
|
102 |
with pytest.raises(ValidationError): |
|
103 |
availability_parameters.full_clean() |
|
104 | ||
105 |
availability_parameters.notification_delays = '' |
|
106 |
with pytest.raises(ValidationError): |
|
107 |
availability_parameters.full_clean() |
|
108 | ||
109 |
availability_parameters.notification_delays = '1,x' |
|
110 |
with pytest.raises(ValidationError): |
|
111 |
availability_parameters.full_clean() |
|
112 | ||
113 |
availability_parameters.notification_delays = '10,1' |
|
114 |
with pytest.raises(ValidationError): |
|
115 |
availability_parameters.full_clean() |
|
116 | ||
117 | ||
118 |
@pytest.mark.parametrize('notification_delays', ['0', '0,5,100', '5,100']) |
|
119 |
def test_log_on_connector_availability_change(db, connector, freezer, notification_delays): |
|
70 | 120 |
connector.title = u'éléphant' |
121 |
availability_parameters = connector.availability_parameters |
|
122 |
availability_parameters.notification_delays = notification_delays |
|
123 |
availability_parameters.save() |
|
124 | ||
71 | 125 |
with HTTMock(up_mock): # set connector as up |
72 | 126 |
connector.availability() |
127 |
assert ResourceLog.objects.count() == 1 |
|
128 |
assert ResourceLog.objects.latest('id').level == 'info' |
|
129 |
assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)' |
|
73 | 130 |
ResourceLog.objects.all().delete() |
131 | ||
132 |
# move 5 minutes in the future |
|
133 |
freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1)) |
|
134 | ||
74 | 135 |
with HTTMock(down_mock): # set connector as down |
75 | 136 |
connector.availability() |
76 |
assert len(ResourceLog.objects.all()) == 2 |
|
137 |
assert connector.down() |
|
138 |
last_count1 = ResourceLog.objects.count() |
|
139 |
assert last_count1 == 2 |
|
77 | 140 |
assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)' |
78 |
assert ResourceLog.objects.all()[1].level == 'error' |
|
79 |
assert ResourceLog.objects.all()[1].message == u'connector "éléphant" (Feed) is now down' |
|
141 |
assert ResourceLog.objects.all()[1].level == 'error' if notification_delays.startswith('0') else 'warning' |
|
142 |
assert (u'connector "éléphant" (Feed) is now down: 404 Client' |
|
143 |
in ResourceLog.objects.all()[1].message) |
|
144 | ||
145 |
# move 5 minutes in the future |
|
146 |
freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1)) |
|
147 | ||
148 |
# second time log as error |
|
149 |
with HTTMock(down_mock): # connector is still down |
|
150 |
connector.availability() |
|
151 |
assert connector.down() |
|
152 |
last_count2 = ResourceLog.objects.count() |
|
153 |
if notification_delays == '0': |
|
154 |
assert last_count2 == last_count1 |
|
155 |
else: |
|
156 |
assert last_count2 == last_count1 + 1 |
|
157 |
assert ResourceLog.objects.all()[2].level == 'error' if notification_delays != '0' else 'warning' |
|
158 |
assert (u'connector "éléphant" (Feed) is still down for 5 minutes: 404' |
|
159 |
in ResourceLog.objects.all()[2].message) |
|
160 | ||
161 |
# move 3 minutes in the future |
|
162 |
freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1)) |
|
163 | ||
164 |
# third time no log |
|
165 |
with HTTMock(down_mock): # connector is still down |
|
166 |
connector.availability() |
|
167 |
assert ResourceLog.objects.count() == last_count2 |
|
168 | ||
169 |
# move 3 minutes in the future |
|
170 |
freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1)) |
|
171 | ||
172 |
with HTTMock(up_mock): # set connector as up |
|
173 |
connector.availability() |
|
174 |
assert not connector.down() |
|
175 |
last_count3 = ResourceLog.objects.count() |
|
176 |
assert last_count3 == last_count2 + 1 |
|
177 |
assert ResourceLog.objects.latest('id').level == 'info' |
|
178 |
assert ResourceLog.objects.latest('id').message == u'connector "éléphant" (Feed) is back up' |
|
179 | ||
180 |
# move 3 minutes in the future |
|
181 |
freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1)) |
|
80 | 182 | |
183 |
# last message is a GET |
|
81 | 184 |
with HTTMock(up_mock): # set connector as up |
82 | 185 |
connector.availability() |
83 |
assert len(ResourceLog.objects.all()) == 3 |
|
84 |
assert ResourceLog.objects.all()[2].level == 'info' |
|
85 |
assert ResourceLog.objects.all()[2].message == u'connector "éléphant" (Feed) is back up' |
|
186 |
assert not connector.down() |
|
187 |
last_count4 = ResourceLog.objects.count() |
|
188 |
assert last_count4 == last_count3 + 1 |
|
189 |
assert ResourceLog.objects.latest('id').level == 'info' |
|
190 |
assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)' |
|
86 |
- |