@@ -14,6 +14,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
+import collections
 from decimal import Decimal
 import datetime
 
@@ -23,15 +24,16 @@
 from django.conf import settings
 
 from zoo.utils import strip_accents
-from zoo.zoo_data.models import Entity
+from zoo.zoo_data.models import Entity, Relation
 from .models import Duplicate
-from .utils import pair_sort, PersonSearch
+from .utils import pair_sort, PersonSearch, UNION_REL, RESPONSABILITE_LEGALE_REL
 
 
 @atomic
 def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count=None, ids=None,
                     progression=False):
     # Define search space
+    sibling_factor = getattr(settings, 'ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR', 0.6)
     limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7)
     base_limit = base_limit or limit / 2.0
     qs = queryset or Entity.objects.all()
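
The new ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR setting is a multiplicative penalty applied further down to pairs that same_network() flags as relatives: family members legitimately share a surname and an address, so their raw similarity is damped to keep them under the duplicate threshold. A minimal sketch of the arithmetic, using the 0.6 and 0.7 defaults from this hunk (the 0.8 score is an invented example):

    from decimal import Decimal

    limit = 0.7                            # ZOO_NANTERRE_DUPLICATES_THRESHOLD default
    sibling_factor = 0.6                   # ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR default
    similarity = Decimal('0.8')            # raw search score for a pair of siblings
    similarity *= Decimal(sibling_factor)  # damped to ~0.48
    assert similarity < limit              # the pair is skipped, not reported
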
@@ -49,8 +51,28 @@
     new = set()
     new_duplicates = []
 
+    conjoints = set()
+    for rel in Relation.objects.filter(schema__slug=UNION_REL):
+        conjoints.add(frozenset([rel.left_id, rel.right_id]))
+    parents = collections.defaultdict(lambda: set())
+    for rel in Relation.objects.filter(schema__slug=RESPONSABILITE_LEGALE_REL):
+        parents[rel.right_id].add(rel.left_id)
+
+    def same_network(first, second):
+        '''Return True if the two persons are part of the same family.'''
+        if frozenset([first.id, second.id]) in conjoints:
+            return True
+        if first.id in parents and second.id in parents[first.id]:
+            return True
+        if second.id in parents and first.id in parents[second.id]:
+            return True
+        if first.id in parents and second.id in parents and parents[first.id] & parents[second.id]:
+            return True
+        return False
+
     search = PersonSearch(limit=limit, base_limit=base_limit)
     count = qs.count()
+    seen = set()
 
     for i, first in enumerate(qs):
         if 'naitre' in strip_accents(first.content['prenoms'].lower()):
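
same_network() relies on two structures built from the Relation table: conjoints, a set of frozensets holding the ids of each married couple, and parents, a mapping from a person's id to the set of ids of their legal guardians. Two persons belong to the same network if they are spouses, if one is a guardian of the other, or if they share at least one guardian (siblings). A self-contained sketch of the same rules with plain ids (the ids and the family layout are invented):

    conjoints = {frozenset([1, 2])}   # 1 and 2 are spouses
    parents = {3: {1, 2}, 4: {1, 2}}  # 3 and 4 are children of 1 and 2

    def same_network(a, b):
        if frozenset([a, b]) in conjoints:
            return True
        if a in parents and b in parents[a]:
            return True
        if b in parents and a in parents[b]:
            return True
        if a in parents and b in parents and parents[a] & parents[b]:
            return True
        return False

    assert same_network(1, 2)         # spouses
    assert same_network(1, 3)         # guardian and child
    assert same_network(3, 4)         # siblings sharing a guardian
    assert not same_network(2, 5)     # unrelated
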
@@ -68,8 +90,15 @@
                 continue
             if first == second:
                 continue
+
             p = pair_sort(first.id, second.id)
             similarity = Decimal(second.similarity)
+            if same_network(first, second):
+                similarity *= Decimal(sibling_factor)
+            if similarity < limit:
+                continue
+
+            seen.add(p)
             if p in known:
                 duplicate = known[p]
                 if duplicate.score == similarity:
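
pair_sort() comes from .utils and is not shown in this diff; since its result keys both the known mapping and the seen set no matter which of the two persons the iteration found first, it presumably returns the two ids in a canonical order. A hypothetical stand-in consistent with that use:

    def pair_sort(a, b):
        # order-independent key: (a, b) and (b, a) map to the same pair
        return (a, b) if a <= b else (b, a)
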
@@ -90,5 +119,6 @@
             yield len(new_duplicates), i + 1, count
 
     Duplicate.objects.bulk_create(new_duplicates)
+    # clear old duplicates whose pair was not seen again in this run
+    Duplicate.objects.filter(id__in=[known[p].id for p in set(known) - set(seen)]).delete()
     yield len(new_duplicates), count, count
-
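
find_duplicates() is a generator, so a caller has to consume it for any work to happen; with progression=True it yields a (new duplicates, processed, total) tuple per person, and it always ends with a final tuple where processed == total. A sketch of a hypothetical caller (the days value is an arbitrary example):

    for new_count, done, total in find_duplicates(days=30, progression=True):
        print('%s/%s persons scanned, %s new duplicates' % (done, total, new_count))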