From 254cbe01fae239ff1e5b450deb379e509d115418 Mon Sep 17 00:00:00 2001 From: Benjamin Dauvergne Date: Fri, 8 Jan 2021 12:00:46 +0100 Subject: [PATCH 4/4] custom_user: specialize free_text_search for common search terms (#49957) --- src/authentic2/custom_user/managers.py | 113 ++++++++++++++----------- src/authentic2/utils/date.py | 33 ++++++++ tests/test_custom_user.py | 78 +++++++++++++++++ 3 files changed, 175 insertions(+), 49 deletions(-) create mode 100644 src/authentic2/utils/date.py diff --git a/src/authentic2/custom_user/managers.py b/src/authentic2/custom_user/managers.py index b1b9ffd3..8c2ee6ac 100644 --- a/src/authentic2/custom_user/managers.py +++ b/src/authentic2/custom_user/managers.py @@ -17,68 +17,82 @@ import datetime import logging import unicodedata +import uuid from django.contrib.contenttypes.models import ContentType from django.contrib.postgres.search import TrigramDistance +from django.core.exceptions import ValidationError from django.db import models, transaction, connection -from django.db.models import F, Value, FloatField, Subquery, OuterRef +from django.db.models import F, Value, FloatField, Subquery, OuterRef, Q from django.db.models.functions import Lower, Coalesce -from django.utils import six from django.utils import timezone from django.contrib.auth.models import BaseUserManager +from django.contrib.postgres.search import SearchQuery from authentic2 import app_settings -from authentic2.models import Attribute, AttributeValue, UserExternalId +from authentic2.models import AttributeValue, UserExternalId from authentic2.utils.lookups import Unaccent, ImmutableConcat +from authentic2.utils.date import parse_date +from authentic2.attribute_kinds import clean_number class UserQuerySet(models.QuerySet): - def free_text_search(self, search): - terms = search.split() - - if not terms: - return self - - searchable_attributes = Attribute.objects.filter(searchable=True) - queries = [] - for term in terms: - q = None - - specific_queries = [] - for a in searchable_attributes: - kind = a.get_kind() - free_text_search_function = kind.get('free_text_search') - if free_text_search_function: - q = free_text_search_function(term) - if q is not None: - specific_queries.append(q & models.query.Q(attribute_values__attribute=a)) - - # if the term is recognized by some specific attribute type, like a - # date, does not use the later generic matcher - if specific_queries: - queries.append(six.moves.reduce(models.query.Q.__or__, specific_queries)) - continue - - q = ( - models.query.Q(username__icontains=term) - | models.query.Q(first_name__icontains=term) - | models.query.Q(last_name__icontains=term) - | models.query.Q(email__icontains=term) - ) - for a in searchable_attributes: - if a.name in ('first_name', 'last_name'): - continue - q = q | models.query.Q( - attribute_values__content__icontains=term, attribute_values__attribute=a) - queries.append(q) - self = self.filter(six.moves.reduce(models.query.Q.__and__, queries)) - # search by attributes can match multiple times - if searchable_attributes: - self = self.distinct() - return self - - def find_duplicates(self, first_name=None, last_name=None, fullname=None, birthdate=None): + search = search.strip() + + if len(search) == 0: + return self.none() + + if '@' in search and len(search.split()) == 1: + qs = self.filter(email__iexact=search).order_by('last_name', 'first_name') + if qs.exists(): + return qs + + try: + guid = uuid.UUID(search) + except ValueError: + pass + else: + return self.filter(uuid=guid.hex) + + try: + phone_number = clean_number(search) + except ValidationError: + pass + else: + attribute_values = AttributeValue.objects.filter( + search_vector=SearchQuery(phone_number), attribute__kind='phone_number') + qs = self.filter(attribute_values__in=attribute_values).order_by('last_name', 'first_name') + if qs.exists(): + return qs + + try: + date = parse_date(search) + except ValueError: + pass + else: + attribute_values = AttributeValue.objects.filter( + search_vector=SearchQuery(date.isoformat()), attribute__kind='birthdate') + qs = self.filter(attribute_values__in=attribute_values).order_by('last_name', 'first_name') + if qs.exists(): + return qs + + qs = self.find_duplicates(fullname=search, limit=None) + extra_user_ids = set() + attribute_values = AttributeValue.objects.filter(search_vector=SearchQuery(search), attribute__searchable=True) + extra_user_ids.update(self.filter(attribute_values__in=attribute_values).values_list('id', flat=True)) + if len(search.split()) == 1: + extra_user_ids.update( + self.filter( + Q(username__istartswith=search) + | Q(email__istartswith=search) + ).values_list('id', flat=True)) + if extra_user_ids: + qs = qs | self.filter(id__in=extra_user_ids) + qs = qs.order_by('dist', 'last_name', 'first_name') + return qs + + def find_duplicates(self, first_name=None, last_name=None, fullname=None, birthdate=None, limit=5): with connection.cursor() as cursor: cursor.execute( "SET pg_trgm.similarity_threshold = %f" % app_settings.A2_DUPLICATES_THRESHOLD @@ -96,7 +110,8 @@ class UserQuerySet(models.QuerySet): qs = qs.filter(name__trigram_similar=name) qs = qs.annotate(dist=TrigramDistance('name', name)) qs = qs.order_by('dist') - qs = qs[:5] + if limit is not None: + qs = qs[:limit] # alter distance according to additionnal parameters if birthdate: diff --git a/src/authentic2/utils/date.py b/src/authentic2/utils/date.py new file mode 100644 index 00000000..96f9c9f3 --- /dev/null +++ b/src/authentic2/utils/date.py @@ -0,0 +1,33 @@ +# authentic2 - versatile identity manager +# Copyright (C) 2010-2020 Entr'ouvert +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from datetime import datetime + +from django.utils import formats + + +def parse_date(formatted_date): + parsed_date = None + for date_format in formats.get_format('DATE_INPUT_FORMATS'): + try: + parsed_date = datetime.strptime(formatted_date, date_format) + except ValueError: + continue + else: + break + if not parsed_date: + raise ValueError + return parsed_date.date() diff --git a/tests/test_custom_user.py b/tests/test_custom_user.py index 870baaca..dade33ea 100644 --- a/tests/test_custom_user.py +++ b/tests/test_custom_user.py @@ -14,10 +14,15 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +from datetime import date + from django.contrib.auth import get_user_model +from authentic2.models import Attribute from django_rbac.utils import get_permission_model, get_role_model +import pytest + Permission = get_permission_model() Role = get_role_model() User = get_user_model() @@ -59,3 +64,76 @@ def test_user_mark_as_deleted(db): User.objects.create(username='foo2', email='foo@example.net') assert len(User.objects.filter(email='foo@example.net')) == 2 assert len(User.objects.filter(email='foo@example.net', deleted__isnull=True)) == 1 + + +@pytest.fixture +def fts(db): + Attribute.objects.create(name='adresse', label='adresse', searchable=True, kind='string') + Attribute.objects.create(name='telephone', label='telephone', searchable=True, kind='phone_number') + Attribute.objects.create(name='dob', label='dob', searchable=True, kind='birthdate') + user1 = User.objects.create( + username='foo1234', + first_name='Jo', + last_name='darmettein', + email='jean.darmette@example.net' + ) + user2 = User.objects.create( + username='bar1234', + first_name='Lea', + last_name='darmettein', + email='micheline.darmette@example.net' + ) + user3 = User.objects.create( + first_name='', + last_name='peuplier', + ) + user1.attributes.adresse = '4 rue des peupliers 13001 MARSEILLE' + user2.attributes.adresse = '4 rue des peupliers 13001 MARSEILLE' + user1.attributes.telephone = '0601020304' + user2.attributes.telephone = '0601020305' + user1.attributes.dob = date(1970, 1, 1) + user2.attributes.dob = date(1972, 2, 2) + return locals() + + +def test_fts_uuid(fts): + assert User.objects.free_text_search(fts['user1'].uuid).count() == 1 + assert User.objects.free_text_search(fts['user2'].uuid).count() == 1 + + +def test_fts_phone(fts): + assert list(User.objects.free_text_search('0601020304')) == [fts['user1']] + assert list(User.objects.free_text_search('0601020305')) == [fts['user2']] + + +def test_fts_dob(fts): + assert User.objects.free_text_search('01/01/1970').count() == 1 + assert User.objects.free_text_search('02/02/1972').count() == 1 + assert User.objects.free_text_search('03/03/1973').count() == 0 + + +def test_fts_email(fts): + assert User.objects.free_text_search('jean.darmette@example.net').count() == 1 + assert User.objects.free_text_search('micheline.darmette@example.net').count() == 1 + + +def test_fts_username(fts): + assert User.objects.free_text_search('foo1234').count() == 1 + assert User.objects.free_text_search('bar1234').count() == 1 + + +def test_fts_trigram(fts): + assert User.objects.free_text_search('darmettein').count() == 2 + # dist attribute signals queryset from find_duplicates() + assert hasattr(User.objects.free_text_search('darmettein')[0], 'dist') + + assert User.objects.free_text_search('lea darmettein').count() == 1 + assert hasattr(User.objects.free_text_search('darmettein')[0], 'dist') + + +def test_fts_legacy(fts): + assert User.objects.free_text_search('rue des peupliers').count() == 2 + + +def test_fts_legacy_and_trigram(fts): + assert User.objects.free_text_search('peuplier').count() == 3 -- 2.29.2