Projet

Général

Profil

0002-log-errors-for-down-connectors-when-it-persists-3538.patch

Benjamin Dauvergne, 20 août 2019 18:41

Télécharger (21,2 ko)

Voir les différences:

Subject: [PATCH 2/2] log errors for down connectors when it persists (#35380)

 passerelle/base/forms.py                      |  25 +++-
 .../migrations/0014_auto_20190820_0914.py     |  26 ++++
 passerelle/base/models.py                     | 102 +++++++++++++--
 passerelle/base/views.py                      |  61 ++++-----
 tests/test_proxylogger.py                     | 121 ++++++++++++++++--
 5 files changed, 286 insertions(+), 49 deletions(-)
 create mode 100644 passerelle/base/migrations/0014_auto_20190820_0914.py
passerelle/base/forms.py
1 1
from django import forms
2 2

  
3
from .models import ApiUser, AccessRight
3
from .models import ApiUser, AccessRight, AvailabilityParameters, parse_notification_delays
4

  
4 5

  
5 6
class ApiUserForm(forms.ModelForm):
6 7
    class Meta:
......
17 18
            'resource_type': forms.HiddenInput(),
18 19
            'resource_pk': forms.HiddenInput(),
19 20
        }
21

  
22

  
23
class AvailabilityParametersForm(forms.ModelForm):
24
    # pre-run validation for the form to work as a popup
25
    # i.e. never raise a ValidationError but use django.contrib.messages in the view
26
    def clean(self):
27
        cleaned_data = self.cleaned_data
28
        try:
29
            notification_delays = cleaned_data.get('notification_delays')
30
            if not notification_delays:
31
                raise ValueError
32
            parse_notification_delays(notification_delays)
33
        except ValueError:
34
            cleaned_data['notification_delays'] = self.initial['notification_delays']
35
        return cleaned_data
36

  
37
    class Meta:
38
        model = AvailabilityParameters
39
        fields = ['run_check', 'notification_delays']
40
        widgets = {
41
            'notification_delays': forms.TextInput,
42
        }
passerelle/base/migrations/0014_auto_20190820_0914.py
1
# -*- coding: utf-8 -*-
2
# Generated by Django 1.11.20 on 2019-08-20 14:14
3
from __future__ import unicode_literals
4

  
5
from django.db import migrations, models
6
import passerelle.base.models
7

  
8

  
9
class Migration(migrations.Migration):
10

  
11
    dependencies = [
12
        ('base', '0013_delete_templatevar'),
13
    ]
14

  
15
    operations = [
16
        migrations.AddField(
17
            model_name='availabilityparameters',
18
            name='notification_delays',
19
            field=models.TextField(default=b'0', validators=[passerelle.base.models.validate_notification_delays]),
20
        ),
21
        migrations.AddField(
22
            model_name='resourcestatus',
23
            name='last_notification_timestamp',
24
            field=models.DateTimeField(blank=True, null=True),
25
        ),
26
    ]
passerelle/base/models.py
8 8
import sys
9 9
import traceback
10 10
import base64
11
import itertools
11 12

  
12 13
import django
13 14
from django.apps import apps
......
20 21
from django.utils.text import slugify
21 22
from django.utils import timezone, six
22 23
from django.utils.translation import ugettext_lazy as _
24
from django.utils.timezone import now
23 25
from django.core.files.base import ContentFile
24 26

  
25 27
from django.contrib.contenttypes.models import ContentType
......
401 403
        if not self.availability_parameters.run_check:
402 404
            return
403 405

  
404
        currently_down = self.down()
406
        availability_parameters = self.availability_parameters
407

  
405 408
        try:
406 409
            self.check_status()
407 410
            status = 'up'
......
409 412
        except NotImplementedError:
410 413
            return
411 414
        except Exception as e:
415
            from passerelle.utils.conversion import exception_to_text
416

  
412 417
            status = 'down'
413
            message = repr(e)[:500]
418
            message = exception_to_text(e)[:500]
414 419

  
415 420
        resource_type = ContentType.objects.get_for_model(self)
416 421
        current_status = ResourceStatus.objects.filter(
417
                resource_type=resource_type,
418
                resource_pk=self.pk).first()
422
            resource_type=resource_type,
423
            resource_pk=self.pk).first()
419 424
        if not current_status or status != current_status.status:
420
            if status == 'down' and not currently_down:
421
                self.logger.error(u'connector "%s" (%s) is now down', self, self.__class__.__name__)
425
            if status == 'down' and not self.down():  # new downtime
426
                if availability_parameters.has_zero_delay():
427
                    self.logger.error(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message)
428
                else:
429
                    self.logger.warning(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message)
422 430
            ResourceStatus(
423 431
                resource_type=resource_type,
424 432
                resource_pk=self.pk,
425 433
                status=status,
426 434
                message=message).save()
427
            if status == 'up' and currently_down:
435
            if status == 'up' and current_status:
428 436
                self.logger.info(u'connector "%s" (%s) is back up', self, self.__class__.__name__)
429 437
        elif status == 'down':
430
            current_status.message = message
431
            current_status.save()
438
            # check last_notification_downtime and current downtime to see if it matches a new notification delay
439
            last_notification_timestamp = current_status.last_notification_timestamp or current_status.start_timestamp
440

  
441
            current_time = now()
442
            downtime = (current_time - current_status.start_timestamp).total_seconds() // 60
443
            last_notification_downtime = (
444
                last_notification_timestamp - current_status.start_timestamp).total_seconds() // 60
445

  
446
            for delay in availability_parameters.notification_delays_generator():
447
                if not delay:
448
                    continue
449
                if downtime >= delay:
450
                    if last_notification_downtime < delay:
451
                        days = downtime // 1440
452
                        hours = downtime // 60
453
                        if days > 1:
454
                            human_duration = 'for %d days' % days
455
                        elif hours > 1:
456
                            human_duration = 'for %d hours' % hours
457
                        else:
458
                            human_duration = 'for %d minutes' % downtime
459
                        self.logger.error(u'connector "%s" (%s) is still down %s: %s',
460
                                          self, self.__class__.__name__,
461
                                          human_duration,
462
                                          message,
463
                                          # when connector is down, logging is shutdown
464
                                          force=True)
465
                        ResourceStatus.objects.filter(pk=current_status.pk).update(
466
                            message=message, last_notification_timestamp=current_time)
467
                        break
468
                else:
469
                    ResourceStatus.objects.filter(pk=current_status.pk).update(
470
                        message=message)
471
                    break
472
            else:
473
                ResourceStatus.objects.filter(pk=current_status.pk).update(
474
                    message=message)
432 475

  
433 476
    def hourly(self):
434 477
        pass
......
537 580
        unique_together = (('resource_type', 'resource_pk'))
538 581

  
539 582

  
583
def parse_notification_delays(value):
584
    delays = [int(v.strip()) for v in value.split(',')]
585
    if not all(delay >= 0 for delay in delays):
586
        raise ValueError
587
    if delays != sorted(delays):
588
        raise ValueError
589
    if len(set(delays)) != len(delays):
590
        raise ValueError
591
    return delays
592

  
593

  
594
def validate_notification_delays(value):
595
    try:
596
        if not value:
597
            raise ValueError
598
        parse_notification_delays(value)
599
    except ValueError:
600
        raise ValidationError(_('You must provide a list of increasing minutes delays'))
601

  
602

  
540 603
class AvailabilityParameters(models.Model):
541 604
    resource_type = models.ForeignKey(ContentType)
542 605
    resource_pk = models.PositiveIntegerField()
......
544 607
    run_check = models.BooleanField(
545 608
        default=True, verbose_name=_('Run regular availability checks'),
546 609
        help_text=_('Run an availability check every 5 minutes'))
610
    notification_delays = models.TextField(
611
        default='0',
612
        blank=False,
613
        validators=[validate_notification_delays],
614
        help_text=_('Increasing delay between error notifications in minutes, ex.: 0,5,10'))
615

  
616
    def has_zero_delay(self):
617
        return 0 in parse_notification_delays(self.notification_delays)
618

  
619
    def notification_delays_generator(self):
620
        notification_delays = parse_notification_delays(self.notification_delays)
621
        last_notification_delay = notification_delays[-1]
622
        if last_notification_delay > 1:
623
            notification_delays = itertools.chain(
624
                notification_delays,
625
                itertools.count(2 * last_notification_delay, last_notification_delay))
626
        return notification_delays
547 627

  
548 628
    class Meta:
549 629
        unique_together = (('resource_type', 'resource_pk'))
......
609 689
    start_timestamp = models.DateTimeField(auto_now_add=True)
610 690
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='unknown')
611 691
    message = models.CharField(max_length=500, blank=True)
692
    last_notification_timestamp = models.DateTimeField(blank=True, null=True)
612 693

  
613 694
    class Meta:
614 695
        ordering = ['-start_timestamp']
......
635 716
        return self._logger.getEffectiveLevel()
636 717

  
637 718
    def _log(self, levelname, message, *args, **kwargs):
638
        if self.connector.down():
719
        force = kwargs.pop('force', False)
720
        if self.connector.down() and not force:
639 721
            # don't log if the connector is known to be down
640 722
            return
641 723
        levelno = getattr(logging, levelname)
passerelle/base/views.py
15 15
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 16

  
17 17
from django.contrib.contenttypes.models import ContentType
18
from django.contrib import messages
18 19
from django.core.urlresolvers import reverse
19 20
from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
20 21
from django.forms import models as model_forms
21 22
from django.views.generic import (
22 23
    DetailView, ListView, CreateView, UpdateView, DeleteView, FormView)
23 24
from django.http import Http404
25
from django.utils.translation import ugettext_lazy as _
24 26

  
25 27
from .models import ApiUser, AccessRight, LoggingParameters, AvailabilityParameters, ResourceStatus
26
from .forms import ApiUserForm, AccessRightForm
28
from .forms import ApiUserForm, AccessRightForm, AvailabilityParametersForm
27 29
from ..utils import get_trusted_services
28 30

  
29 31

  
......
155 157
        return super(LoggingParametersUpdateView, self).form_valid(form)
156 158

  
157 159

  
158
class ManageAvailabilityView(FormView):
160
class ManageAvailabilityView(UpdateView):
159 161
    template_name = 'passerelle/manage/manage_availability_form.html'
162
    form_class = AvailabilityParametersForm
163

  
164
    def get_resource(self):
165
        if not hasattr(self, '_resource'):
166
            content_type = ContentType.objects.get_for_id(self.kwargs['resource_type'])
167
            self._resource = content_type.model_class().objects.get(pk=self.kwargs['resource_pk'])
168
        return self._resource
169

  
170
    def get_object(self, queryset=None):
171
        return self.get_resource().availability_parameters
160 172

  
161 173
    def get_context_data(self, **kwargs):
162 174
        context = super(ManageAvailabilityView, self).get_context_data(**kwargs)
163
        connector = self.get_resource()
164
        context['connector'] = connector
165
        context['availability_status'] = connector.get_availability_status()
175
        context['connector'] = self.get_resource()
176
        context['availability_status'] = self.get_resource().get_availability_status()
166 177
        return context
167 178

  
168
    def get_form_class(self):
169
        form_class = model_forms.modelform_factory(
170
            AvailabilityParameters,
171
            fields=['run_check'])
172
        return form_class
173

  
174
    def get_initial(self):
175
        d = self.initial.copy()
176
        d['resource_type'] = self.kwargs['resource_type']
177
        d['resource_pk'] = self.kwargs['resource_pk']
178
        d['run_check'] = self.get_resource().availability_parameters.run_check
179
        return d
180

  
181
    def get_resource(self):
182
        content_type = ContentType.objects.get_for_id(self.kwargs['resource_type'])
183
        return content_type.model_class().objects.get(pk=self.kwargs['resource_pk'])
184

  
185 179
    def get_success_url(self):
186 180
        return self.get_resource().get_absolute_url()
187 181

  
188 182
    def form_valid(self, form):
189 183
        resource = self.get_resource()
190
        parameters = resource.availability_parameters
191
        run_check = form.cleaned_data['run_check']
192 184

  
193
        if not run_check and resource.down():
185
        if not form.instance.run_check and resource.down():
194 186
            resource_type = ContentType.objects.get_for_model(resource)
195 187
            ResourceStatus(
196 188
                resource_type=resource_type,
......
198 190
                status='up',
199 191
                message='').save()
200 192

  
201
        if parameters.run_check != run_check:
202
            parameters.run_check = run_check
203
            parameters.save()
204
            resource.logger.info(u'availability checks %s', 'enabled' if run_check else 'disabled')
193
        if form.initial['notification_delays'] != form.cleaned_data['notification_delays']:
194
            resource.logger.info(u'availability checks delays set to %s', form.instance.notification_delays)
195
        elif 'notification_delays' in form.changed_data:
196
            messages.warning(self.request, _(
197
                'Notification delays must be a list of increasing delays between '
198
                'error notifications in minutes, ex.: 0,5,10'))
199

  
200
        response = super(ManageAvailabilityView, self).form_valid(form)
201

  
202
        if 'run_check' in form.changed_data:
203
            resource.logger.info(u'availability checks %s', 'enabled' if form.instance.run_check else 'disabled')
204
            if form.instance.run_check:
205
                resource.availability()
205 206

  
206
        return super(ManageAvailabilityView, self).form_valid(form)
207
        return response
tests/test_proxylogger.py
1 1
# -*- coding: utf-8 -*-
2 2

  
3 3
import logging
4
import datetime
5
import itertools
4 6

  
5 7
import pytest
6
from httmock import all_requests, HTTMock
8
from httmock import HTTMock
9

  
10
from django.core.exceptions import ValidationError
7 11

  
8 12
from passerelle.base.models import ProxyLogger, ResourceLog
9 13
from passerelle.apps.feeds.models import Feed
......
66 70
    pr.debug(u'some message')
67 71
    assert len(ResourceLog.objects.all()) == 0
68 72

  
69
def test_log_on_connector_availability_change(db, connector):
73

  
74
def test_validate_notification_delays(db, connector):
75
    def take(iterator, count):
76
        return list(itertools.compress(iterator, range(1, count + 1)))
77

  
78
    availability_parameters = connector.availability_parameters
79
    assert availability_parameters.notification_delays == '0'
80

  
81
    availability_parameters.full_clean()
82
    assert availability_parameters.has_zero_delay()
83
    assert list(availability_parameters.notification_delays_generator()) == [0]
84

  
85
    availability_parameters.notification_delays = '0,5,100,1000'
86
    availability_parameters.full_clean()
87
    assert availability_parameters.has_zero_delay()
88
    assert take(availability_parameters.notification_delays_generator(), 6) == [0, 5, 100, 1000, 2000, 3000]
89

  
90
    availability_parameters.notification_delays = '5,100,1000'
91
    availability_parameters.full_clean()
92
    assert not availability_parameters.has_zero_delay()
93
    assert take(availability_parameters.notification_delays_generator(), 6) == [5, 100, 1000, 2000, 3000, 4000]
94

  
95
    availability_parameters.notification_delays = '5'
96
    availability_parameters.full_clean()
97
    assert not availability_parameters.has_zero_delay()
98
    assert take(availability_parameters.notification_delays_generator(), 3) == [5, 10, 15]
99

  
100
    # validation errors
101
    availability_parameters.notification_delays = '5,100,100'
102
    with pytest.raises(ValidationError):
103
        availability_parameters.full_clean()
104

  
105
    availability_parameters.notification_delays = ''
106
    with pytest.raises(ValidationError):
107
        availability_parameters.full_clean()
108

  
109
    availability_parameters.notification_delays = '1,x'
110
    with pytest.raises(ValidationError):
111
        availability_parameters.full_clean()
112

  
113
    availability_parameters.notification_delays = '10,1'
114
    with pytest.raises(ValidationError):
115
        availability_parameters.full_clean()
116

  
117

  
118
@pytest.mark.parametrize('notification_delays', ['0', '0,5,100', '5,100'])
119
def test_log_on_connector_availability_change(db, connector, freezer, notification_delays):
70 120
    connector.title = u'éléphant'
121
    availability_parameters = connector.availability_parameters
122
    availability_parameters.notification_delays = notification_delays
123
    availability_parameters.save()
124

  
71 125
    with HTTMock(up_mock):  # set connector as up
72 126
        connector.availability()
127
    assert ResourceLog.objects.count() == 1
128
    assert ResourceLog.objects.latest('id').level == 'info'
129
    assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)'
73 130
    ResourceLog.objects.all().delete()
131

  
132
    # move 5 minutes in the future
133
    freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1))
134

  
74 135
    with HTTMock(down_mock):  # set connector as down
75 136
        connector.availability()
76
    assert len(ResourceLog.objects.all()) == 2
137
    assert connector.down()
138
    last_count1 = ResourceLog.objects.count()
139
    assert last_count1 == 2
77 140
    assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)'
78
    assert ResourceLog.objects.all()[1].level == 'error'
79
    assert ResourceLog.objects.all()[1].message == u'connector "éléphant" (Feed) is now down'
141
    assert ResourceLog.objects.all()[1].level == 'error' if notification_delays.startswith('0') else 'warning'
142
    assert (u'connector "éléphant" (Feed) is now down: 404 Client'
143
            in ResourceLog.objects.all()[1].message)
144

  
145
    # move 5 minutes in the future
146
    freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1))
147

  
148
    # second time log as error
149
    with HTTMock(down_mock):  # connector is still down
150
        connector.availability()
151
    assert connector.down()
152
    last_count2 = ResourceLog.objects.count()
153
    if notification_delays == '0':
154
        assert last_count2 == last_count1
155
    else:
156
        assert last_count2 == last_count1 + 1
157
        assert ResourceLog.objects.all()[2].level == 'error' if notification_delays != '0' else 'warning'
158
        assert (u'connector "éléphant" (Feed) is still down for 5 minutes: 404'
159
                in ResourceLog.objects.all()[2].message)
160

  
161
    # move 3 minutes in the future
162
    freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
163

  
164
    # third time no log
165
    with HTTMock(down_mock):  # connector is still down
166
        connector.availability()
167
    assert ResourceLog.objects.count() == last_count2
168

  
169
    # move 3 minutes in the future
170
    freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
171

  
172
    with HTTMock(up_mock):  # set connector as up
173
        connector.availability()
174
    assert not connector.down()
175
    last_count3 = ResourceLog.objects.count()
176
    assert last_count3 == last_count2 + 1
177
    assert ResourceLog.objects.latest('id').level == 'info'
178
    assert ResourceLog.objects.latest('id').message == u'connector "éléphant" (Feed) is back up'
179

  
180
    # move 3 minutes in the future
181
    freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
80 182

  
183
    # last message is a GET
81 184
    with HTTMock(up_mock):  # set connector as up
82 185
        connector.availability()
83
    assert len(ResourceLog.objects.all()) == 3
84
    assert ResourceLog.objects.all()[2].level == 'info'
85
    assert ResourceLog.objects.all()[2].message == u'connector "éléphant" (Feed) is back up'
186
    assert not connector.down()
187
    last_count4 = ResourceLog.objects.count()
188
    assert last_count4 == last_count3 + 1
189
    assert ResourceLog.objects.latest('id').level == 'info'
190
    assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)'
86
-