From 584b0ba9cd27a1045078cbc03d01ec9c337ab349 Mon Sep 17 00:00:00 2001 From: Benjamin Dauvergne Date: Fri, 7 Jun 2019 14:49:39 +0200 Subject: [PATCH 2/2] update and cache metadata from URL and path (#10196) --- README | 34 +++++- debian/control | 6 +- mellon/adapters.py | 190 +++++++++++++++++++++++++++++----- mellon/app_settings.py | 2 + mellon/utils.py | 5 +- setup.py | 1 + tests/conftest.py | 26 ++++- tests/test_default_adapter.py | 68 +++++++++++- tests/test_utils.py | 13 +-- tox.ini | 4 +- 10 files changed, 305 insertions(+), 44 deletions(-) diff --git a/README b/README index a06e1e9..4e2dd8f 100644 --- a/README +++ b/README @@ -76,11 +76,23 @@ overridden in the identity provider settings by removing the MELLON_IDENTITY_PROVIDERS ------------------------- -A list of dictionaries, only one key is mandatory in those -dictionaries `METADATA` it should contain the UTF-8 content of the -metadata file of the identity provider or if it starts with a slash -the absolute path toward a metadata file. All other keys are override -of generic settings. +A list of dictionaries, they must contain at least one of the keys `METADATA` +(inline copy of the identity provider metadata), `METADATA_URL` URL of the IdP +metadata file, or `METADATA_PATH` an absolute path to the IdP metadata file.. +All other keys are override of generic settings. + +When using an URL, the URL is automatically cached in the `MEDIA_ROOT` +directory of your application in the directory named `mellon_metadata_cache`. +If you restart the application and the URL is unavailable, the file cache will +be used. The cache will be refreshed every `MELLON_METADATA_CACHE_TIME` seconds. +If the HTTP retrieval of the metadata URL takes longer thant +`METTON_METADATA_HTTP_TIMEOUT` seconds, retrieval will be skipped. + +When the cache is already loaded, retrievals are done in the background by a +thread. + +When using a local absolute path, the metadata is reloaded each time the +modification time of the file is superior to the last time it was loaded. MELLON_PUBLIC_KEYS ------------------ @@ -261,6 +273,18 @@ MELLON_DEFAULT_ASSERTION_CONSUMER_BINDING Should be post or artifact. Default is post. You can refer to the SAML 2.0 specification to learn the difference. +MELLON_METADATA_CACHE_TIME +-------------------------- + +When using METADATA_URL to reference a metadata file, it's the duration in +secondes between refresh of the metadata file. Default is 3600 seconds, 1 hour. + +METTON_METADATA_HTTP_TIMEOUT +--------------------------- + +Timeout in seconds for HTTP call made to retrieve metadata files. Default is 10 +seconds. + Tests ===== diff --git a/debian/control b/debian/control index dd7f882..340f7a5 100644 --- a/debian/control +++ b/debian/control @@ -15,7 +15,8 @@ Depends: ${misc:Depends}, ${python:Depends}, python (>= 2.7), python-django (>= 1.5), python-isodate, - python-lasso + python-lasso, + python-atomicwrites Breaks: python-hobo (<< 0.34.5) Description: SAML authentication for Django @@ -24,5 +25,6 @@ Architecture: all Depends: ${misc:Depends}, ${python:Depends}, python3-django (>= 1.5), python3-isodate, - python3-lasso + python3-lasso, + python3-atomicwrites Description: SAML authentication for Django diff --git a/mellon/adapters.py b/mellon/adapters.py index a402896..1f4a3e4 100644 --- a/mellon/adapters.py +++ b/mellon/adapters.py @@ -13,22 +13,31 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +from xml.etree import ElementTree as ET +import hashlib import logging +import os +import threading +import time import uuid -from xml.etree import ElementTree as ET import lasso import requests import requests.exceptions +from atomicwrites import atomic_write from django.core.exceptions import PermissionDenied +from django.core.files.storage import default_storage from django.contrib import auth from django.contrib.auth.models import Group from django.utils import six from django.utils.encoding import force_text +from django.utils.six.moves.urllib.parse import urlparse from . import utils, app_settings, models +logger = logging.getLogger(__name__) + class UserCreationError(Exception): pass @@ -49,40 +58,169 @@ class DefaultAdapter(object): def get_idps(self): for i, idp in enumerate(self.get_identity_providers_setting()): - if 'METADATA_URL' in idp and 'METADATA' not in idp: + if self.load_idp(idp, i): + yield idp + + def load_metadata_path(self, idp, i): + path = idp['METADATA_PATH'] + if not os.path.exists(path): + logger.warning('metadata path %s does not exist', path) + return + last_update = idp.get('METADATA_PATH_LAST_UPDATE', 0) + try: + mtime = os.stat(path).st_mtime + except OSError as e: + logger.warning('metadata path %s : stat() call failed, %s', path, e) + return + if last_update == 0 or mtime >= last_update: + idp['METADATA_PATH_LAST_UPDATE'] = time.time() + try: + with open(path) as fd: + metadata = fd.read() + except OSError as e: + logger.warning('metadata path %s : open()/read() call failed, %s', path, e) + return + entity_id = self.load_entity_id(metadata, i) + if not entity_id: + logger.error('invalid metadata file retrieved from %s', path) + return + if 'ENTITY_ID' in idp and idp['ENTITY_ID'] != entity_id: + logger.error('metadata path %s : entityID changed %r != %r', path, entity_id, idp['ENTITY_ID']) + del idp['ENTITY_ID'] + idp['METADATA'] = metadata + + def load_metadata_url(self, idp, i): + url = idp['METADATA_URL'] + try: + hostname = urlparse(url).hostname + except (ValueError, TypeError) as e: + logger.error('invalid METADATA_URL %r: %s', url, e) + return + if not hostname: + logger.error('no hostname in METADATA_URL %r: %s', url) + return + last_update = idp.get('METADATA_URL_LAST_UPDATE', 0) + metadata_cache_time = utils.get_setting(idp, 'METADATA_CACHE_TIME') + timeout = utils.get_setting(idp, 'METADATA_HTTP_TIMEOUT') + now = time.time() + + try: + url_fingerprint = hashlib.md5(url.encode('ascii')).hexdigest() + file_cache_key = '%s_%s.xml' % (hostname, url_fingerprint) + except (UnicodeError, TypeError, ValueError): + logger.exception('unable to compute file_cache_key') + return + + cache_directory = default_storage.path('mellon_metadata_cache') + file_cache_path = os.path.join(cache_directory, file_cache_key) + + if not os.path.exists(cache_directory): + os.makedirs(cache_directory) + + if os.path.exists(file_cache_path) and 'METADATA' not in idp: + try: + with open(file_cache_path) as fd: + idp['METADATA'] = fd.read() + except OSError: + pass + + # fresh cache, skip loading + if last_update and 'METADATA' in idp and (now - last_update) < metadata_cache_time: + return + + def __http_get(): + try: verify_ssl_certificate = utils.get_setting( idp, 'VERIFY_SSL_CERTIFICATE') try: - response = requests.get(idp['METADATA_URL'], verify=verify_ssl_certificate) + response = requests.get(url, verify=verify_ssl_certificate, timeout=timeout) response.raise_for_status() except requests.exceptions.RequestException as e: self.logger.error( u'retrieval of metadata URL %r failed with error %s for %d-th idp', - idp['METADATA_URL'], e, i) - continue + url, e, i) + try: + with open(file_cache_path) as fd: + pass + idp['METADATA_PATH'] = file_cache_path + self.load_metadata_path() + except IOError: + pass + return + entity_id = self.load_entity_id(response.text, i) + if not entity_id: + logger.error('invalid metadata file retrieved from %s', url) + return + if 'ENTITY_ID' in idp and idp['ENTITY_ID'] != entity_id: + logger.error('metadata url %s : entityID changed %r != %r', url, entity_id, idp['ENTITY_ID']) + del idp['ENTITY_ID'] idp['METADATA'] = response.text - elif 'METADATA' in idp: - if idp['METADATA'].startswith('/'): - idp['METADATA'] = open(idp['METADATA']).read() - else: - self.logger.error(u'missing METADATA or METADATA_URL in %d-th idp', i) - continue + idp['METADATA_URL_LAST_UPDATE'] = now + with atomic_write(file_cache_path, mode='wb', overwrite=True) as fd: + fd.write(response.text.encode('utf-8')) + idp['METADATA_PATH'] = file_cache_path + idp['METADATA_PATH_LAST_UPDATE'] = time.time() + 1 + idp.pop('METADATA_URL_UPDATE_THREAD', None) + logger.debug('metadata url %s : update throught HTTP', url) + finally: + stale_timeout = 24 * metadata_cache_time + if last_update and (now - idp['METADATA_URL_LAST_UPDATE']) > stale_timeout: + logger.error('metadata url %s : not updated since %.1f hours', + stale_timeout / 3600.0) + + # we have cache, update in background + if last_update and 'METADATA' in idp: + t = threading.Thread(target=__http_get) + t.start() + # suspend updates for HTTP timeout + 5 seconds + idp['METADATA_URL_UPDATE_THREAD'] = t + idp['METADATA_URL_LAST_UPDATE'] = last_update + timeout + 5 + else: + # synchronous update + __http_get() + + def load_metadata(self, idp, i): + # legacy support + if 'METADATA' in idp and idp['METADATA'].startswith('/'): + idp['METADATA_PATH'] = idp['METADATA'] + del idp['METADATA'] + + if 'METADATA_PATH' in idp: + self.load_metadata_path(idp, i) + + if 'METADATA_URL' in idp: + self.load_metadata_url(idp, i) + + if 'METADATA' in idp: if 'ENTITY_ID' not in idp: - try: - doc = ET.fromstring(idp['METADATA']) - except (TypeError, ET.ParseError): - self.logger.error(u'METADATA of %d-th idp is invalid', i) - continue - if doc.tag != '{%s}EntityDescriptor' % lasso.SAML2_METADATA_HREF: - self.logger.error(u'METADATA of %d-th idp has no EntityDescriptor root tag', i) - continue - - if 'entityID' not in doc.attrib: - self.logger.error( - u'METADATA of %d-th idp has no entityID attribute on its root tag', i) - continue - idp['ENTITY_ID'] = doc.attrib['entityID'] - yield idp + entity_id = self.load_entity_id(idp['METADATA'], i) + if entity_id: + idp['ENTITY_ID'] = entity_id + return idp['METADATA'] + + def load_entity_id(self, metadata, i): + try: + doc = ET.fromstring(metadata) + except (TypeError, ET.ParseError): + self.logger.error(u'METADATA of %d-th idp is invalid', i) + return None + if doc.tag != '{%s}EntityDescriptor' % lasso.SAML2_METADATA_HREF: + self.logger.error(u'METADATA of %d-th idp has no EntityDescriptor root tag', i) + return None + + if 'entityID' not in doc.attrib: + self.logger.error( + u'METADATA of %d-th idp has no entityID attribute on its root tag', i) + return None + return doc.attrib['entityID'] + + def load_idp(self, idp, i): + metadata = self.load_metadata(idp, i) + if not metadata: + self.logger.error(u'missing METADATA or METADATA_URL in %d-th idp', i) + return False + + return 'ENTITY_ID' in idp def authorize(self, idp, saml_attributes): if not idp: diff --git a/mellon/app_settings.py b/mellon/app_settings.py index ae095bd..5091097 100644 --- a/mellon/app_settings.py +++ b/mellon/app_settings.py @@ -40,6 +40,8 @@ class AppSettings(object): 'ARTIFACT_RESOLVE_TIMEOUT': 10.0, 'LOGIN_HINTS': [], 'SIGNATURE_METHOD': 'RSA-SHA256', + 'METADATA_CACHE_TIME': 3600, + 'METADATA_HTTP_TIMEOUT': 10, } @property diff --git a/mellon/utils.py b/mellon/utils.py index b095b1a..c58efc5 100644 --- a/mellon/utils.py +++ b/mellon/utils.py @@ -276,7 +276,10 @@ def get_xml_encoding(content): xml_encoding[0] = encoding parser = expat.ParserCreate() parser.XmlDeclHandler = xmlDeclHandler - parser.Parse(content, True) + try: + parser.Parse(content, True) + except expat.ExpatError as e: + raise ValueError('invalid XML %s' % e) return xml_encoding[0] diff --git a/setup.py b/setup.py index fb0fc00..4d719c5 100755 --- a/setup.py +++ b/setup.py @@ -94,6 +94,7 @@ setup(name="django-mellon", 'django>=1.5,<2.0', 'requests', 'isodate', + 'atomicwrites', ], setup_requires=[ 'django>=1.5,<2.0', diff --git a/tests/conftest.py b/tests/conftest.py index bfa8788..885175e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,18 @@ +import os import logging + import pytest import django_webtest +@pytest.fixture(autouse=True) +def settings(settings, tmpdir): + settings.MEDIA_ROOT = str(tmpdir.mkdir('media')) + return settings + + @pytest.fixture -def app(request): +def app(request, settings): wtm = django_webtest.WebTestMixin() wtm._patch_settings() request.addfinalizer(wtm._unpatch_settings) @@ -23,7 +31,7 @@ def concurrency(settings): @pytest.fixture -def private_settings(request): +def private_settings(request, tmpdir): import django.conf from django.conf import UserSettingsHolder old = django.conf.settings._wrapped @@ -42,3 +50,17 @@ def caplog(caplog): caplog.handler.stream = py.io.TextIO() caplog.handler.records = [] return caplog + + +@pytest.fixture(scope='session') +def metadata(): + with open(os.path.join(os.path.dirname(__file__), 'metadata.xml')) as fd: + yield fd.read() + + +@pytest.fixture +def metadata_path(tmpdir, metadata): + metadata_path = tmpdir / 'metadata.xml' + with metadata_path.open('w') as fd: + fd.write(metadata) + yield str(metadata_path) diff --git a/tests/test_default_adapter.py b/tests/test_default_adapter.py index 8cf4382..a28ad28 100644 --- a/tests/test_default_adapter.py +++ b/tests/test_default_adapter.py @@ -1,8 +1,13 @@ -import pytest + + +import datetime import re import lasso +import time from multiprocessing.pool import ThreadPool +import pytest + from django.contrib import auth from django.db import connection @@ -196,3 +201,64 @@ def test_lookup_user_transient_with_email(private_settings): user = adapter.lookup_user(idp, saml_attributes) assert user is None assert User.objects.count() == 0 + + +@pytest.fixture +def adapter(): + return DefaultAdapter() + + +def test_load_metadata_simple(adapter, metadata): + idp = {'METADATA': metadata} + assert adapter.load_metadata(idp, 0) == metadata + + +def test_load_metadata_legacy(adapter, metadata_path, metadata): + idp = {'METADATA': metadata_path} + assert adapter.load_metadata(idp, 0) == metadata + assert idp['METADATA'] == metadata + + +def test_load_metadata_path(adapter, metadata_path, metadata, freezer): + now = time.time() + idp = {'METADATA_PATH': str(metadata_path)} + assert adapter.load_metadata(idp, 0) == metadata + assert idp['METADATA'] == metadata + assert idp['METADATA_PATH_LAST_UPDATE'] == now + + +def test_load_metadata_url(settings, adapter, metadata, httpserver, freezer, caplog): + now = time.time() + httpserver.serve_content(content=metadata, headers={'Content-Type': 'application/xml'}) + idp = {'METADATA_URL': httpserver.url} + assert adapter.load_metadata(idp, 0) == metadata + assert idp['METADATA'] == metadata + assert idp['METADATA_URL_LAST_UPDATE'] == now + assert 'METADATA_PATH' in idp + assert idp['METADATA_PATH'].startswith(settings.MEDIA_ROOT) + with open(idp['METADATA_PATH']) as fd: + assert fd.read() == metadata + assert idp['METADATA_PATH_LAST_UPDATE'] == now + 1 + httpserver.serve_content(content=metadata.replace('idp5', 'idp6'), + headers={'Content-Type': 'application/xml'}) + assert adapter.load_metadata(idp, 0) == metadata + + freezer.move_to(datetime.timedelta(seconds=3601)) + caplog.clear() + assert adapter.load_metadata(idp, 0) == metadata + # wait for update thread to finish + try: + idp['METADATA_URL_UPDATE_THREAD'].join() + except KeyError: + pass + new_meta = adapter.load_metadata(idp, 0) + assert new_meta != metadata + assert new_meta == metadata.replace('idp5', 'idp6') + assert 'entityID changed' in caplog.records[-1].message + assert caplog.records[-1].levelname == 'ERROR' + # test load from file cache + del idp['METADATA'] + del idp['METADATA_PATH'] + del idp['METADATA_PATH_LAST_UPDATE'] + httpserver.serve_content(content='', headers={'Content-Type': 'application/xml'}) + assert adapter.load_metadata(idp, 0) == metadata.replace('idp5', 'idp6') diff --git a/tests/test_utils.py b/tests/test_utils.py index 8c4cf9c..4f19ace 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -67,17 +67,18 @@ def test_create_server_invalid_metadata_file(mocker, rf, private_settings, caplo assert len(server.providers) == 0 -def test_create_server_good_metadata_file(mocker, rf, private_settings, caplog): +def test_create_server_good_metadata_file(mocker, rf, private_settings, tmpdir, caplog): + path = tmpdir / 'metadata.xml' + with path.open('w') as fd: + fd.write(open('tests/metadata.xml').read()) + private_settings.MELLON_IDENTITY_PROVIDERS = [ { - 'METADATA': '/xxx', + 'METADATA': str(path), } ] request = rf.get('/') - with mock.patch( - 'mellon.adapters.open', mock.mock_open(read_data=open('tests/metadata.xml').read()), - create=True): - server = create_server(request) + server = create_server(request) assert 'ERROR' not in caplog.text assert len(server.providers) == 1 diff --git a/tox.ini b/tox.ini index a2a945a..9a87536 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = {coverage-,}py2-{dj18,dj111}-{pg,sqlite},py3-dj111-{pg,sqlite} +envlist = coverage-py2-{dj18,dj111}-{pg,sqlite},coverage-py3-dj111-{pg,sqlite} toxworkdir = {env:TMPDIR:/tmp}/tox-{env:USER}/django-mellon/ [testenv] @@ -24,6 +24,8 @@ deps = pytest-random pytest-mock pytest-django + pytest-freezegun + pytest-localserver pytz lxml cssselect -- 2.20.1