From 73e8a032d0ffeec2dfbe185a1f9bfb285d0c6885 Mon Sep 17 00:00:00 2001 From: Benjamin Dauvergne Date: Sun, 12 Jan 2020 21:57:22 +0100 Subject: [PATCH 2/2] =?UTF-8?q?nanterre:=20diminuer=20la=20similarit=C3=A9?= =?UTF-8?q?=20pour=20les=20membres=20d'une=20m=C3=AAme=20famille=20(#37038?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zoo/zoo_nanterre/duplicates.py | 35 ++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/zoo/zoo_nanterre/duplicates.py b/zoo/zoo_nanterre/duplicates.py index 23a73ff..5d01abb 100644 --- a/zoo/zoo_nanterre/duplicates.py +++ b/zoo/zoo_nanterre/duplicates.py @@ -14,6 +14,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import collections from decimal import Decimal import datetime @@ -23,15 +24,16 @@ from django.utils.timezone import now from django.conf import settings from zoo.utils import strip_accents -from zoo.zoo_data.models import Entity +from zoo.zoo_data.models import Entity, Relation from .models import Duplicate -from .utils import pair_sort, PersonSearch +from .utils import pair_sort, PersonSearch, UNION_REL, RESPONSABILITE_LEGALE_REL @atomic def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count=None, ids=None, progression=False): # Define search space + sibling_factor = getattr(settings, 'ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR', 0.6) limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7) base_limit = base_limit or limit / 2.0 qs = queryset or Entity.objects.all() @@ -49,8 +51,28 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count new = set() new_duplicates = [] + conjoints = set() + for rel in Relation.objects.filter(schema__slug=UNION_REL): + conjoints.add(frozenset([rel.left_id, rel.right_id])) + parents = collections.defaultdict(lambda: set()) + for rel in 
Relation.objects.filter(schema__slug=RESPONSABILITE_LEGALE_REL): + parents[rel.right_id].add(rel.left_id) + + def same_network(first, second): + '''Returns true if persons are parts of the same family''' + if frozenset([first.id, second.id]) in conjoints: + return True + if first.id in parents and second.id in parents[first.id]: + return True + if second.id in parents and first.id in parents[second.id]: + return True + if first.id in parents and second.id in parents and parents[first.id] & parents[second.id]: + return True + return False + search = PersonSearch(limit=limit, base_limit=base_limit) count = qs.count() + seen = set() for i, first in enumerate(qs): if 'naitre' in strip_accents(first.content['prenoms'].lower()): @@ -68,8 +90,15 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count continue if first == second: continue + p = pair_sort(first.id, second.id) similarity = Decimal(second.similarity) + if same_network(first, second): + similarity *= Decimal(sibling_factor) + if similarity < limit: + continue + + seen.add(p) if p in known: duplicate = known[p] if duplicate.score == similarity: @@ -90,4 +119,6 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count yield len(new_duplicates), i + 1, count Duplicate.objects.bulk_create(new_duplicates) + # clear old duplicates + Duplicate.objects.filter(id__in=[known[p].id for p in set(known) - set(seen)]).delete() yield len(new_duplicates), count, count -- 2.24.0