0002-nanterre-diminuer-la-similarit-pour-les-membres-d-un.patch

Benjamin Dauvergne, 15 janvier 2020 23:51

Voir les différences: en ligne côte à côte

Subject: [PATCH 2/2] =?UTF-8?q?nanterre:=20diminuer=20la=20similarit=C3=A9?=
 =?UTF-8?q?=20pour=20les=20membres=20d'une=20m=C3=AAme=20famille=20(#37038?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 tests/test_nanterre_doublons.py | 20 ++++++++++++++++++
 zoo/zoo_nanterre/duplicates.py  | 36 +++++++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 2 deletions(-)

         call_command('rsu-duplicates', 'find')
         assert Duplicate.objects.count() < (Entity.objects.filter(schema__slug='individu').count() / 5)
         call_command('rsu-duplicates', 'list')
     def test_ignore_siblings(nanterre_classic_family, settings):
         '''Members of one family stop being reported once the sibling factor is lowered.'''

         def find_and_count():
             # Re-run the detection command, then report how many duplicates are stored.
             call_command('rsu-duplicates', 'find')
             return Duplicate.objects.count()

         # The untouched family yields no duplicate at all.
         assert find_and_count() == 0

         # Make kevin look like his father jean: give him the first name
         # JEANNOT (close to JEAN) and move his birthdate to an "undefined"
         # value (earlier than 1903).
         kevin = nanterre_classic_family['kevin']
         kevin.content['prenoms'] = 'JEANNOT'
         kevin.content['date_de_naissance'] = '1901-01-01'
         kevin.save()

         # With a neutral factor of 1.0, kevin and jean are reported as
         # potential duplicates.
         settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 1.0
         assert find_and_count() == 1

         # Lowering the factor to 0.9 makes the pair disappear again.
         settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 0.9
         assert find_and_count() == 0

     # You should have received a copy of the GNU Affero General Public License
     # along with this program.  If not, see <http://www.gnu.org/licenses/>.
     import collections
     from decimal import Decimal
     import datetime
-...
     from django.conf import settings
     from zoo.utils import strip_accents
     from zoo.zoo_data.models import Entity
     from zoo.zoo_data.models import Entity, Relation
     from .models import Duplicate
     from .utils import pair_sort, PersonSearch
     from .utils import pair_sort, PersonSearch, UNION_REL, RESPONSABILITE_LEGALE_REL
     @atomic
-...
         # Define search space
         limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7)
         base_limit = base_limit or limit / 2.0
         sibling_factor = getattr(settings, 'ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR', 0.6)
         qs = queryset or Entity.objects.all()
         qs = qs.filter(schema__slug='individu')
         if days:
-...
         new = set()
         new_duplicates = []
         conjoints = set()
         for rel in Relation.objects.filter(schema__slug=UNION_REL):
             conjoints.add(frozenset([rel.left_id, rel.right_id]))
         parents = collections.defaultdict(lambda: set())
         for rel in Relation.objects.filter(schema__slug=RESPONSABILITE_LEGALE_REL):
             parents[rel.right_id].add(rel.left_id)
         def same_network(first, second):
             '''Tell whether two persons belong to the same family.

             True when the pair is recorded in ``conjoints``, when one of them
             appears in the ``parents`` entry of the other, or when their
             ``parents`` entries share at least one member; False otherwise.
             '''
             first_id, second_id = first.id, second.id
             # .get() never invokes the defaultdict factory, so looking up an
             # unknown id does not add an empty entry to ``parents``.
             first_parents = parents.get(first_id, frozenset())
             second_parents = parents.get(second_id, frozenset())
             return (
                 frozenset((first_id, second_id)) in conjoints
                 or second_id in first_parents
                 or first_id in second_parents
                 or bool(first_parents & second_parents)
             )
         search = PersonSearch(limit=limit, base_limit=base_limit)
         count = qs.count()
         seen = set()
         for i, first in enumerate(qs):
             if 'naitre' in strip_accents(first.content['prenoms'].lower()):
-...
                     continue
                 if first == second:
                     continue
                 p = pair_sort(first.id, second.id)
                 similarity = Decimal(second.similarity)
                 if same_network(first, second):
                     similarity *= Decimal(sibling_factor)
                 if similarity < limit:
                     continue
                 seen.add(p)
                 if p in known:
                     duplicate = known[p]
                     if duplicate.score == similarity:
-...
                 yield len(new_duplicates), i + 1, count
         Duplicate.objects.bulk_create(new_duplicates)
         # clear old duplicates
         Duplicate.objects.filter(id__in=[known[p].id for p in set(known) - set(seen)]).delete()
         yield len(new_duplicates), count, count
+    -

Projet

Général

Profil

Produits Entr'ouvert » Zoo

0002-nanterre-diminuer-la-similarit-pour-les-membres-d-un.patch