Projet

Général

Profil

0002-nanterre-diminuer-la-similarit-pour-les-membres-d-un.patch

Benjamin Dauvergne, 15 janvier 2020 23:51

Télécharger (5,3 ko)

Voir les différences:

Subject: [PATCH 2/2] =?UTF-8?q?nanterre:=20diminuer=20la=20similarit=C3=A9?=
 =?UTF-8?q?=20pour=20les=20membres=20d'une=20m=C3=AAme=20famille=20(#37038?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 tests/test_nanterre_doublons.py | 20 ++++++++++++++++++
 zoo/zoo_nanterre/duplicates.py  | 36 +++++++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 2 deletions(-)
tests/test_nanterre_doublons.py
163 163
    call_command('rsu-duplicates', 'find')
164 164
    assert Duplicate.objects.count() < (Entity.objects.filter(schema__slug='individu').count() / 5)
165 165
    call_command('rsu-duplicates', 'list')
166

  
167

  
168
def test_ignore_siblings(nanterre_classic_family, settings):
169
    call_command('rsu-duplicates', 'find')
170
    assert Duplicate.objects.count() == 0
171

  
172
    # by setting kevin's birthdate to an undefined value (earlier than 1903)
173
    # and changing kevin's first name to JEANNOT, which resembles his father's
174
    # first name JEAN, we find that kevin and jean are potential duplicates
175
    nanterre_classic_family['kevin'].content['prenoms'] = 'JEANNOT'
176
    nanterre_classic_family['kevin'].content['date_de_naissance'] = '1901-01-01'
177
    nanterre_classic_family['kevin'].save()
178
    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 1.0
179
    call_command('rsu-duplicates', 'find')
180
    assert Duplicate.objects.count() == 1
181

  
182
    # if we lower the sibling factor to 0.9, the duplicate is now ignored
183
    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 0.9
184
    call_command('rsu-duplicates', 'find')
185
    assert Duplicate.objects.count() == 0
zoo/zoo_nanterre/duplicates.py
14 14
# You should have received a copy of the GNU Affero General Public License
15 15
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 16

  
17
import collections
17 18
from decimal import Decimal
18 19
import datetime
19 20

  
......
23 24
from django.conf import settings
24 25

  
25 26
from zoo.utils import strip_accents
26
from zoo.zoo_data.models import Entity
27
from zoo.zoo_data.models import Entity, Relation
27 28
from .models import Duplicate
28
from .utils import pair_sort, PersonSearch
29
from .utils import pair_sort, PersonSearch, UNION_REL, RESPONSABILITE_LEGALE_REL
29 30

  
30 31

  
31 32
@atomic
......
34 35
    # Define search space
35 36
    limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7)
36 37
    base_limit = base_limit or limit / 2.0
38
    sibling_factor = getattr(settings, 'ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR', 0.6)
39

  
37 40
    qs = queryset or Entity.objects.all()
38 41
    qs = qs.filter(schema__slug='individu')
39 42
    if days:
......
49 52
    new = set()
50 53
    new_duplicates = []
51 54

  
55
    conjoints = set()
56
    for rel in Relation.objects.filter(schema__slug=UNION_REL):
57
        conjoints.add(frozenset([rel.left_id, rel.right_id]))
58
    parents = collections.defaultdict(lambda: set())
59
    for rel in Relation.objects.filter(schema__slug=RESPONSABILITE_LEGALE_REL):
60
        parents[rel.right_id].add(rel.left_id)
61

  
62
    def same_network(first, second):
63
        '''Return True if both persons are part of the same family'''
64
        if frozenset([first.id, second.id]) in conjoints:
65
            return True
66
        if first.id in parents and second.id in parents[first.id]:
67
            return True
68
        if second.id in parents and first.id in parents[second.id]:
69
            return True
70
        if first.id in parents and second.id in parents and parents[first.id] & parents[second.id]:
71
            return True
72
        return False
73

  
52 74
    search = PersonSearch(limit=limit, base_limit=base_limit)
53 75
    count = qs.count()
76
    seen = set()
54 77

  
55 78
    for i, first in enumerate(qs):
56 79
        if 'naitre' in strip_accents(first.content['prenoms'].lower()):
......
68 91
                continue
69 92
            if first == second:
70 93
                continue
94

  
71 95
            p = pair_sort(first.id, second.id)
72 96
            similarity = Decimal(second.similarity)
97
            if same_network(first, second):
98
                similarity *= Decimal(sibling_factor)
99
            if similarity < limit:
100
                continue
101

  
102
            seen.add(p)
73 103
            if p in known:
74 104
                duplicate = known[p]
75 105
                if duplicate.score == similarity:
......
90 120
            yield len(new_duplicates), i + 1, count
91 121

  
92 122
    Duplicate.objects.bulk_create(new_duplicates)
123
    # clear old duplicates
124
    Duplicate.objects.filter(id__in=[known[p].id for p in set(known) - set(seen)]).delete()
93 125
    yield len(new_duplicates), count, count
94
-