From cfe6247ceeff6ae3cc8b2d4117c82c514fd7c889 Mon Sep 17 00:00:00 2001
From: Benjamin Dauvergne
Date: Sun, 12 Jan 2020 21:57:22 +0100
Subject: [PATCH 2/2] nanterre: reduce the similarity between members of the
 same family (#37038)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_nanterre_doublons.py | 17 ++++++++++++++++
 zoo/zoo_nanterre/duplicates.py  | 35 +++++++++++++++++++++++++++++++--
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/tests/test_nanterre_doublons.py b/tests/test_nanterre_doublons.py
index 255623e..9f06ea5 100644
--- a/tests/test_nanterre_doublons.py
+++ b/tests/test_nanterre_doublons.py
@@ -163,3 +163,20 @@ def test_doublons_cmd(lot_of_names):
     call_command('rsu-duplicates', 'find')
     assert Duplicate.objects.count() < (Entity.objects.filter(schema__slug='individu').count() / 5)
     call_command('rsu-duplicates', 'list')
+
+
+def test_ignore_siblings(nanterre_classic_family, settings):
+    # By moving kevin's birthdate before 1903, so that it counts as
+    # undefined, and changing his first name to JEANNOT, which looks like his
+    # father's first name JEAN, kevin and jean become potential duplicates.
+    nanterre_classic_family['kevin'].content['prenoms'] = 'JEANNOT'
+    nanterre_classic_family['kevin'].content['date_de_naissance'] = '1901-01-01'
+    nanterre_classic_family['kevin'].save()
+    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 1.0
+    call_command('rsu-duplicates', 'find')
+    assert Duplicate.objects.count() == 1
+
+    # If we lower the sibling factor to 0.9, the duplicate is now ignored.
+    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 0.9
+    call_command('rsu-duplicates', 'find')
+    assert Duplicate.objects.count() == 0
diff --git a/zoo/zoo_nanterre/duplicates.py b/zoo/zoo_nanterre/duplicates.py
index 23a73ff..37383b5 100644
--- a/zoo/zoo_nanterre/duplicates.py
+++ b/zoo/zoo_nanterre/duplicates.py
@@ -14,6 +14,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
+import collections
 from decimal import Decimal
 import datetime
 
@@ -23,15 +24,16 @@
 from django.utils.timezone import now
 from django.conf import settings
 
 from zoo.utils import strip_accents
-from zoo.zoo_data.models import Entity
+from zoo.zoo_data.models import Entity, Relation
 from .models import Duplicate
-from .utils import pair_sort, PersonSearch
+from .utils import pair_sort, PersonSearch, UNION_REL, RESPONSABILITE_LEGALE_REL
 
 
 @atomic
 def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count=None, ids=None, progression=False):
     # Define search space
+    sibling_factor = getattr(settings, 'ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR', 0.6)
     limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7)
     base_limit = base_limit or limit / 2.0
     qs = queryset or Entity.objects.all()
@@ -49,8 +51,28 @@
     new = set()
     new_duplicates = []
 
+    conjoints = set()  # unordered pairs of partners in a union relation
+    for rel in Relation.objects.filter(schema__slug=UNION_REL):
+        conjoints.add(frozenset([rel.left_id, rel.right_id]))
+    parents = collections.defaultdict(set)  # child id -> set of parent ids
+    for rel in Relation.objects.filter(schema__slug=RESPONSABILITE_LEGALE_REL):
+        parents[rel.right_id].add(rel.left_id)
+
+    def same_network(first, second):
+        '''Return True if the two persons belong to the same family.'''
+        if frozenset([first.id, second.id]) in conjoints:
+            return True
+        if first.id in parents and second.id in parents[first.id]:
+            return True
+        if second.id in parents and first.id in parents[second.id]:
+            return True
+        if first.id in parents and second.id in parents and parents[first.id] & parents[second.id]:
+            return True
+        return False
+
     search = PersonSearch(limit=limit, base_limit=base_limit)
     count = qs.count()
+    seen = set()  # pairs still detected as duplicates during this run
 
     for i, first in enumerate(qs):
         if 'naitre' in strip_accents(first.content['prenoms'].lower()):
@@ -68,8 +90,15 @@
                 continue
             if first == second:
                 continue
+            p = pair_sort(first.id, second.id)
             similarity = Decimal(second.similarity)
+            if same_network(first, second):
+                similarity *= Decimal(sibling_factor)  # family members are expected to look alike
+                if similarity < limit:
+                    continue
+
+            seen.add(p)
 
             if p in known:
                 duplicate = known[p]
                 if duplicate.score == similarity:
@@ -90,4 +119,6 @@
                 yield len(new_duplicates), i + 1, count
 
     Duplicate.objects.bulk_create(new_duplicates)
+    # clear old duplicates that were not seen again during this run
+    Duplicate.objects.filter(id__in=[known[p].id for p in set(known) - set(seen)]).delete()
     yield len(new_duplicates), count, count
-- 
2.24.0
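
As an illustration of the rule the patch implements, here is a minimal,
self-contained sketch of the damping logic that runs without Django. All the
ids, the scores and the keep_pair helper are invented for the example; in the
patch itself the family links come from Relation rows and the factor and
threshold come from the ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR and
ZOO_NANTERRE_DUPLICATES_THRESHOLD settings.

from decimal import Decimal

SIBLING_FACTOR = Decimal('0.9')  # stands in for the sibling factor setting
THRESHOLD = Decimal('0.7')       # stands in for the threshold setting

conjoints = {frozenset([1, 2])}  # persons 1 and 2 are partners
parents = {3: {1, 2}}            # person 3 is their child

def same_network(first, second):
    '''Return True for partners, parent/child pairs and siblings.'''
    if frozenset([first, second]) in conjoints:
        return True
    if second in parents.get(first, set()) or first in parents.get(second, set()):
        return True
    # siblings share at least one parent
    return bool(parents.get(first, set()) & parents.get(second, set()))

def keep_pair(first, second, similarity):
    '''Damp the score of family pairs and drop them when they fall under
    the threshold; other pairs were already filtered by the search.'''
    if same_network(first, second):
        similarity *= SIBLING_FACTOR
        if similarity < THRESHOLD:
            return False
    return True

print(keep_pair(1, 3, Decimal('0.75')))  # False: 0.75 * 0.9 = 0.675 < 0.7
print(keep_pair(1, 4, Decimal('0.75')))  # True: person 4 is unrelated

Damping the score instead of excluding family pairs outright keeps real
duplicates reachable: a genuinely duplicated record of a parent still scores
high enough to pass the threshold after multiplication by the factor.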