Project

General

Profile

0001-general-redo-full-text-search-using-querysets-33632.patch

Frédéric Péters, 22 January 2020 10:31 AM

Download (39.3 KB)

View differences:

Subject: [PATCH] general: redo full text search using querysets (#33632)

 combo/apps/search/__init__.py                 |  19 ++
 combo/apps/search/management/__init__.py      |   0
 .../search/management/commands/__init__.py    |   0
 .../management/commands/update_index.py       |  78 -------
 .../search/migrations/0006_indexedcell.py     |  35 ++++
 combo/apps/search/models.py                   |  78 ++++---
 combo/apps/search/utils.py                    | 111 ++++++++++
 combo/data/apps.py                            |  15 --
 .../0041_delete_externallinksearchitem.py     |  18 ++
 combo/data/models.py                          |  16 --
 combo/data/search_indexes.py                  |  46 -----
 combo/data/templates/combo/search/page.txt    |   7 -
 combo/public/urls.py                          |   1 -
 combo/public/views.py                         |  28 ---
 combo/settings.py                             |   9 +-
 debian/combo.cron.hourly                      |   2 -
 debian/control                                |   3 +-
 requirements.txt                              |   2 -
 setup.py                                      |   2 -
 tests/settings.py                             |   3 -
 tests/test_search.py                          | 195 +++++++++++-------
 21 files changed, 358 insertions(+), 310 deletions(-)
 delete mode 100644 combo/apps/search/management/__init__.py
 delete mode 100644 combo/apps/search/management/commands/__init__.py
 delete mode 100644 combo/apps/search/management/commands/update_index.py
 create mode 100644 combo/apps/search/migrations/0006_indexedcell.py
 create mode 100644 combo/apps/search/utils.py
 create mode 100644 combo/data/migrations/0041_delete_externallinksearchitem.py
 delete mode 100644 combo/data/search_indexes.py
 delete mode 100644 combo/data/templates/combo/search/page.txt
combo/apps/search/__init__.py
15 15
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 16

  
17 17
import django.apps
18
from django.core.urlresolvers import reverse
18 19
from django.utils.translation import ugettext_lazy as _
19 20

  
20 21
from .engines import engines
......
28 29
        from . import urls
29 30
        return urls.urlpatterns
30 31

  
32
    def hourly(self):
33
        from .utils import index_site
34
        index_site()
35

  
36
    def ready(self):
37
        # register built-in search engine for page contents
38
        engines.register(self.get_search_engines)
39

  
40
    def get_search_engines(self):
41
        from .utils import search_site
42
        return {
43
            '_text': {
44
                'function': search_site,
45
                'label': _('Page Contents'),
46
            }
47
        }
48

  
49

  
31 50
default_app_config = 'combo.apps.search.AppConfig'
combo/apps/search/management/commands/update_index.py
1
# combo - content management system
2
# Copyright (C) 2017  Entr'ouvert
3
#
4
# This program is free software: you can redistribute it and/or modify it
5
# under the terms of the GNU Affero General Public License as published
6
# by the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
# GNU Affero General Public License for more details.
13
#
14
# You should have received a copy of the GNU Affero General Public License
15
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
16

  
17
from django.utils.timezone import now
18

  
19
from haystack.management.commands.update_index import Command as UpdateIndexCommand
20

  
21
from combo.data.models import Page, ExternalLinkSearchItem
22
from combo.apps.search.models import SearchCell
23

  
24

  
25
class Command(UpdateIndexCommand):
26

  
27
    def add_arguments(self, parser):
28
        super(Command, self).add_arguments(parser)
29
        parser.add_argument(
30
            '--skip-external-links-collection', action='store_true', default=False,
31
            dest='skip_external_links_collection')
32

  
33
    def handle(self, **options):
34
        if not any(SearchCell.get_cells_by_search_service('_text')):
35
            # do not index site if there's no matching search cell
36
            return
37
        if not options.get('skip_external_links_collection', False):
38
            self.collect_external_links(options)
39
        return super(Command, self).handle(**options)
40

  
41
    def collect_external_links(self, options):
42
        start_time = now()
43

  
44
        if options.get('remove'):
45
            ExternalLinkSearchItem.objects.all().delete()
46

  
47
        # assemble external links data
48
        links = {}
49
        for page in Page.objects.filter(sub_slug=''):
50
            if not page.is_visible(user=None):
51
                continue
52
            for cell in page.get_cells():
53
                if not cell.is_visible(user=None):
54
                    continue
55
                for link_data in cell.get_external_links_data():
56
                    if not link_data['url'] in links:
57
                        # create an entry for that link.
58
                        links[link_data['url']] = {}
59
                        links[link_data['url']]['title'] = link_data['title']
60
                        links[link_data['url']]['all_texts'] = []
61
                    else:
62
                        # if that link already exists, just keep the title as
63
                        # text.
64
                        links[link_data['url']]['all_texts'].append(link_data['title'])
65
                    # additional texts will be assembled and indexed
66
                    links[link_data['url']]['all_texts'].append(link_data.get('text') or '')
67

  
68
        # save data as ExternalLinkSearchItem objects
69
        for link_url, link_data in links.items():
70
            link_object, created = ExternalLinkSearchItem.objects.get_or_create(
71
                    url=link_url,
72
                    defaults={'title': link_data['title']})
73
            link_object.title = link_data['title']
74
            link_object.text = '\n'.join(link_data['all_texts'])
75
            link_object.save()
76

  
77
        # remove obsolete objects
78
        ExternalLinkSearchItem.objects.filter(last_update_timestamp__lt=start_time).delete()
combo/apps/search/migrations/0006_indexedcell.py
1
# -*- coding: utf-8 -*-
2
# Generated by Django 1.11.17 on 2020-01-20 15:30
3
from __future__ import unicode_literals
4

  
5
from django.db import migrations, models
6
import django.db.models.deletion
7

  
8

  
9
class Migration(migrations.Migration):
10

  
11
    dependencies = [
12
        ('data', '0041_delete_externallinksearchitem'),
13
        ('auth', '0008_alter_user_username_max_length'),
14
        ('contenttypes', '0002_remove_content_type_name'),
15
        ('search', '0005_searchcell_autofocus'),
16
    ]
17

  
18
    operations = [
19
        migrations.CreateModel(
20
            name='IndexedCell',
21
            fields=[
22
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
23
                ('cell_pk', models.PositiveIntegerField(null=True)),
24
                ('url', models.CharField(blank=True, max_length=500, null=True)),
25
                ('title', models.CharField(blank=True, max_length=500, null=True)),
26
                ('indexed_text', models.TextField(blank=True, null=True)),
27
                ('public_access', models.BooleanField(default=False)),
28
                ('last_update_timestamp', models.DateTimeField(auto_now=True)),
29
                ('cell_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='contenttypes.ContentType')),
30
                ('excluded_groups', models.ManyToManyField(blank=True, related_name='_indexedcell_excluded_groups_+', to='auth.Group')),
31
                ('page', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='data.Page')),
32
                ('restricted_groups', models.ManyToManyField(blank=True, related_name='_indexedcell_restricted_groups_+', to='auth.Group')),
33
            ],
34
        ),
35
    ]
combo/apps/search/models.py
16 16

  
17 17
import os
18 18

  
19
from django.conf import settings
19
from django.contrib.auth.models import Group
20
from django.contrib.contenttypes import fields
21
from django.contrib.contenttypes.models import ContentType
20 22
from django.db import models
21 23
from django.utils.translation import ugettext_lazy as _
22 24
from django import template
23 25
from django.http import HttpResponse
24 26
from django.core.exceptions import PermissionDenied
25
from django.core.urlresolvers import reverse
26 27
from django.utils.http import quote
27 28
from django.template import RequestContext, Template
28 29

  
29 30
from jsonfield import JSONField
30
from haystack import connections
31 31

  
32 32
from combo.utils import requests
33
from combo.data.models import CellBase
33
from combo.data.models import CellBase, Page
34 34
from combo.data.library import register_cell_class
35 35
from combo.utils import get_templated_url
36 36

  
......
69 69
        services = []
70 70
        for service_slug in self._search_services.get('data') or []:
71 71
            service = engines.get(service_slug)
72
            if service and service.get('url'):
72
            if service and (service.get('url') or service.get('function')):
73 73
                service['slug'] = service_slug
74 74
                services.append(service)
75 75
        return services
......
141 141
        if not query:
142 142
            return render_response(service)
143 143

  
144
        url = get_templated_url(service['url'],
145
                context={'request': request, 'q': query, 'search_service': service})
146
        url = url % {'q': quote(query.encode('utf-8'))}  # if url contains %(q)s
147
        if url.startswith('/'):
148
            url = request.build_absolute_uri(url)
149

  
150
        if not url:
151
            return render_response(service)
152

  
153
        kwargs = {}
154
        kwargs['cache_duration'] = service.get('cache_duration', 0)
155
        kwargs['remote_service'] = 'auto' if service.get('signature') else None
156
        # don't automatically add user info to query string, if required it can
157
        # be set explicitely in the URL template in the engine definition (via
158
        # {{user_nameid}} or {{user_email}}).
159
        kwargs['without_user'] = True
160
        # don't send error traces on HTTP errors
161
        kwargs['log_errors'] = 'warn'
162

  
163
        response = requests.get(url, **kwargs)
164
        try:
165
            results = response.json()
166
        except ValueError:
167
            return render_response(service)
144
        if service.get('function'):  # internal search engine
145
            results = {'data': service['function'](request, query)}
146
        else:
147
            url = get_templated_url(service['url'],
148
                    context={'request': request, 'q': query, 'search_service': service})
149
            url = url % {'q': quote(query.encode('utf-8'))}  # if url contains %(q)s
150
            if url.startswith('/'):
151
                url = request.build_absolute_uri(url)
152

  
153
            if not url:
154
                return render_response(service)
155

  
156
            kwargs = {}
157
            kwargs['cache_duration'] = service.get('cache_duration', 0)
158
            kwargs['remote_service'] = 'auto' if service.get('signature') else None
159
            # don't automatically add user info to query string, if required it can
160
            # be set explicitely in the URL template in the engine definition (via
161
            # {{user_nameid}} or {{user_email}}).
162
            kwargs['without_user'] = True
163
            # don't send error traces on HTTP errors
164
            kwargs['log_errors'] = 'warn'
165

  
166
            response = requests.get(url, **kwargs)
167
            try:
168
                results = response.json()
169
            except ValueError:
170
                return render_response(service)
168 171

  
169 172
        if service.get('data_key'):
170 173
            results['data'] = results.get(service['data_key']) or []
......
179 182
            for hit in results.get('data') or []:
180 183
                for k, v in hit_templates.items():
181 184
                    hit[k] = v.render(RequestContext(request, hit))
185

  
182 186
        return render_response(service, results)
183 187

  
184 188
    def has_text_search_service(self):
185 189
        return '_text' in self._search_services.get('data', [])
186 190

  
187 191
    def missing_index(self):
188
        return not os.path.exists(connections['default'].get_backend().path)
192
        return IndexedCell.objects.all().count() == 0
193

  
194

  
195
class IndexedCell(models.Model):
196
    cell_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
197
    cell_pk = models.PositiveIntegerField(null=True)
198
    cell = fields.GenericForeignKey('cell_type', 'cell_pk')
199
    page = models.ForeignKey(Page, on_delete=models.CASCADE, blank=True, null=True)
200
    url = models.CharField(max_length=500, blank=True, null=True)
201
    title = models.CharField(max_length=500, blank=True, null=True)
202
    indexed_text = models.TextField(blank=True, null=True)
203
    public_access = models.BooleanField(default=False)
204
    restricted_groups = models.ManyToManyField(Group, blank=True, related_name='+')
205
    excluded_groups = models.ManyToManyField(Group, blank=True, related_name='+')
206
    last_update_timestamp = models.DateTimeField(auto_now=True)
combo/apps/search/utils.py
1
# combo - content management system
2
# Copyright (C) 2014-2020  Entr'ouvert
3
#
4
# This program is free software: you can redistribute it and/or modify it
5
# under the terms of the GNU Affero General Public License as published
6
# by the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
# GNU Affero General Public License for more details.
13
#
14
# You should have received a copy of the GNU Affero General Public License
15
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
16

  
17
from django.conf import settings
18
from django.contrib.contenttypes.models import ContentType
19
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
20
from combo.data.models import CellBase
21
from django.db import connection
22
from django.db.models import Q
23
from django.db.transaction import atomic
24

  
25
from .models import IndexedCell
26

  
27

  
28
def set_cell_access(indexed_cell, cell):
    """Copy the access rules of ``cell`` (and its page) onto ``indexed_cell``.

    ``indexed_cell`` must already be saved, as its many-to-many relations
    are modified.  The entry is flagged as publicly accessible only when both
    the page and the cell are public; otherwise:

    * ``restricted_groups`` receives the groups of the cell, plus the groups
      of the page unless the cell is ``restricted_to_unlogged``;
    * ``excluded_groups`` receives the groups of the page when the cell is
      ``restricted_to_unlogged``.

    Both relations are reset first and ``indexed_cell`` is saved at the end.
    """
    indexed_cell.public_access = bool(cell.page.public and cell.public)
    # always start from a clean state, whatever was recorded previously.
    indexed_cell.excluded_groups.clear()
    indexed_cell.restricted_groups.clear()
    if not indexed_cell.public_access:
        page_groups = cell.page.groups.all()
        indexed_cell.restricted_groups.set(cell.groups.all())
        if cell.restricted_to_unlogged:
            indexed_cell.excluded_groups.set(page_groups)
        else:
            # add all page groups in a single call instead of one call (and
            # one database round-trip) per group.
            indexed_cell.restricted_groups.add(*page_groups)
    indexed_cell.save()
40

  
41

  
42
@atomic
def index_site():
    """Rebuild the whole search index, in a single transaction.

    All existing ``IndexedCell`` rows are removed, then for every cell that
    is not part of a page snapshot and not in a technical placeholder (name
    starting with ``_``):

    * an entry is created with the text returned by ``render_for_search()``
      (cells raising during rendering are skipped entirely, cells rendering
      to nothing get no entry);
    * an entry is created for each external link advertised by
      ``get_external_links_data()``; a link met several times (same URL) is
      indexed once, with the titles and texts of later occurrences appended
      to the indexed text.
    """
    IndexedCell.objects.all().delete()
    external_urls = {}  # link url -> IndexedCell
    for klass in CellBase.get_cell_classes():
        for cell in klass.objects.filter(page__snapshot__isnull=True).exclude(placeholder__startswith='_'):
            cell_type = ContentType.objects.get_for_model(cell)
            indexed_cell = IndexedCell(cell_type=cell_type, cell_pk=cell.id)
            try:
                indexed_cell.indexed_text = cell.render_for_search()
            except Exception:  # ignore rendering error
                continue
            if indexed_cell.indexed_text:
                indexed_cell.page_id = cell.page_id
                indexed_cell.url = cell.page.get_online_url()
                indexed_cell.title = cell.page.title
                # the object has to be saved before its group relations can
                # be set.
                indexed_cell.save()
                set_cell_access(indexed_cell, cell)

            for link_data in cell.get_external_links_data():
                # index external links; entries are keyed by the URL of the
                # link (not by the URL of the page holding the cell).
                link_url = link_data['url']
                link_text = link_data.get('text') or ''
                indexed_link = external_urls.get(link_url)
                if indexed_link is None:
                    # create an entry for that link.
                    indexed_link = IndexedCell(cell_type=cell_type, cell_pk=cell.id)
                    indexed_link.save()
                    set_cell_access(indexed_link, cell)
                    indexed_link.url = link_url
                    indexed_link.title = link_data['title']
                    indexed_link.indexed_text = link_text
                    external_urls[link_url] = indexed_link
                else:
                    # if that link already exists, add detailed texts
                    indexed_link.indexed_text += ' ' + link_data['title']
                    indexed_link.indexed_text += ' ' + link_text
                indexed_link.save()
78

  
79

  
80
def search_site(request, query):
    """Search the site index and return at most 10 hits for ``query``.

    On PostgreSQL, full text search is used on the title and indexed text
    (with the ``POSTGRESQL_FTS_SEARCH_CONFIG`` configuration), keeping
    entries with a rank of at least 0.3, best ranked first.  On other
    database backends a case-insensitive substring match is used.

    Results are filtered according to ``request.user``: anonymous users only
    get publicly accessible entries; other users get entries that are not
    restricted to groups, or restricted to one of their groups, minus
    entries excluding one of their groups.

    Returns a list of dictionaries with ``text`` (title), ``rank`` (``None``
    when no ranking is available) and ``url`` keys; a given URL appears only
    once.
    """
    if connection.vendor == 'postgresql':
        config = settings.POSTGRESQL_FTS_SEARCH_CONFIG
        vector = (SearchVector('title', config=config, weight='A')
                  + SearchVector('indexed_text', config=config, weight='A'))
        # the query must be parsed with the same configuration as the
        # vectors, otherwise stemming would not match.
        search_query = SearchQuery(query, config=config)
        qs = IndexedCell.objects.annotate(
                rank=SearchRank(vector, search_query)).filter(rank__gte=0.3).order_by('-rank')
    else:
        qs = IndexedCell.objects.filter(
                Q(indexed_text__icontains=query) | Q(title__icontains=query))
    if request.user.is_anonymous:
        qs = qs.exclude(public_access=False)
    else:
        user_groups = request.user.groups.all()
        qs = qs.filter(
                Q(restricted_groups=None) |
                Q(restricted_groups__in=user_groups))
        qs = qs.exclude(excluded_groups__in=user_groups)

    hits = []
    seen_urls = set()
    for hit in qs:
        # several entries may share a URL (many cells on a page, joins on
        # groups), only the first one is reported.
        if hit.url in seen_urls:
            continue
        hits.append({
            'text': hit.title,
            'rank': getattr(hit, 'rank', None),
            'url': hit.url,
        })
        seen_urls.add(hit.url)
        if len(hits) == 10:
            break

    return hits
combo/data/apps.py
15 15
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 16

  
17 17
from django.apps import AppConfig
18
from django.core.urlresolvers import reverse
19
from django.utils.translation import ugettext_lazy as _
20 18

  
21 19

  
22 20
class DataConfig(AppConfig):
23 21
    name = 'combo.data'
24 22
    verbose_name = 'data'
25

  
26
    def ready(self):
27
        # register built-in search engine for page contents
28
        from combo.apps.search import engines
29
        engines.register(self.get_search_engines)
30

  
31
    def get_search_engines(self):
32
        return {
33
            '_text': {
34
                'url': reverse('api-search') + '?q=%(q)s',
35
                'label': _('Page Contents'),
36
            }
37
        }
combo/data/migrations/0041_delete_externallinksearchitem.py
1
# -*- coding: utf-8 -*-
2
# Generated by Django 1.11.17 on 2020-01-20 15:30
3
from __future__ import unicode_literals
4

  
5
from django.db import migrations
6

  
7

  
8
class Migration(migrations.Migration):
9

  
10
    dependencies = [
11
        ('data', '0040_auto_20200119_1017'),
12
    ]
13

  
14
    operations = [
15
        migrations.DeleteModel(
16
            name='ExternalLinkSearchItem',
17
        ),
18
    ]
combo/data/models.py
729 729
            return ''
730 730
        if self.user_dependant:
731 731
            return ''
732
        if not self.page.is_visible(user=None):
733
            return ''
734
        if not self.is_visible(user=None):
735
            return ''
736 732
        request = RequestFactory().get(self.page.get_online_url())
737 733
        request.user = None  # compat
738 734
        context = {
......
1447 1443
        return context
1448 1444

  
1449 1445

  
1450
class ExternalLinkSearchItem(models.Model):
1451
    # Link to an external site.
1452
    #
1453
    # Those are automatically collected by the "update_index" command,
1454
    # that calls get_external_links_data from all available cells, to be used
1455
    # by the general search engine.
1456
    title = models.CharField(_('Title'), max_length=150)
1457
    text = models.TextField(blank=True)
1458
    url = models.CharField(_('URL'), max_length=200, blank=True)
1459
    last_update_timestamp = models.DateTimeField(auto_now=True)
1460

  
1461

  
1462 1446
@receiver(pre_save, sender=Page)
1463 1447
def create_redirects(sender, instance, raw, **kwargs):
1464 1448
    if raw or not instance.id or instance.snapshot_id:
combo/data/search_indexes.py
1
# combo - content management system
2
# Copyright (C) 2014-2017  Entr'ouvert
3
#
4
# This program is free software: you can redistribute it and/or modify it
5
# under the terms of the GNU Affero General Public License as published
6
# by the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
# GNU Affero General Public License for more details.
13
#
14
# You should have received a copy of the GNU Affero General Public License
15
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
16

  
17
from haystack import indexes
18
from haystack.exceptions import SkipDocument
19

  
20
from .models import Page, CellBase, ExternalLinkSearchItem
21

  
22
class PageIndex(indexes.SearchIndex, indexes.Indexable):
23
    title = indexes.CharField(model_attr='title', boost=1.5)
24
    text = indexes.CharField(document=True, use_template=True,
25
            template_name='combo/search/page.txt')
26
    url = indexes.CharField(indexed=False)
27

  
28
    def get_model(self):
29
        return Page
30

  
31
    def prepare_url(self, obj):
32
        return obj.get_online_url()
33

  
34
    def prepare(self, obj):
35
        if not obj.is_visible(user=None):
36
            raise SkipDocument()
37
        return super(PageIndex, self).prepare(obj)
38

  
39

  
40
class ExternalLinkSearchIndex(indexes.SearchIndex, indexes.Indexable):
41
    title = indexes.CharField(model_attr='title', boost=1.5)
42
    text = indexes.CharField(model_attr='text', document=True)
43
    url = indexes.CharField(model_attr='url', indexed=False)
44

  
45
    def get_model(self):
46
        return ExternalLinkSearchItem
combo/data/templates/combo/search/page.txt
1
{% autoescape off %}
2
{% for cell in object.get_cells %}
3
 {% if cell.placeholder|first != '_' %} {# ignore technical placeholders #}
4
  {{ cell.render_for_search }}
5
 {% endif %}
6
{% endfor %}
7
{% endautoescape %}
combo/public/urls.py
21 21

  
22 22
urlpatterns = [
23 23
    url(r'^api/menu-badges/$', views.menu_badges),
24
    url(r'^api/search/$', views.api_search, name='api-search'),
25 24
    url(r'^ajax/cell/(?P<page_pk>\w+)/(?P<cell_reference>[\w_-]+)/$',
26 25
        views.ajax_page_cell, name='combo-public-ajax-page-cell'),
27 26
    url(r'^snapshot/(?P<pk>\w+)/$', manager_required(views.snapshot), name='combo-snapshot-view'),
combo/public/views.py
40 40
from django.utils.translation import ugettext as _
41 41
from django.forms.widgets import Media
42 42

  
43
from haystack.inputs import AutoQuery
44
from haystack.query import SearchQuerySet, SQ
45

  
46 43
if 'mellon' in settings.INSTALLED_APPS:
47 44
    from mellon.utils import get_idps
48 45
else:
......
552 549
menu_badges.mellon_no_passive = True
553 550

  
554 551

  
555
def api_search(request):
556
    for cell in SearchCell.get_cells_by_search_service('_text'):
557
        if not cell.is_visible(request.user):
558
            continue
559
        break
560
    else:
561
        raise Http404()
562
    query = request.GET.get('q') or ''
563
    sqs = SearchQuerySet().filter(SQ(content=AutoQuery(query)) | SQ(title=AutoQuery(query)))
564
    sqs = sqs.highlight()
565
    sqs.load_all()
566
    hits = []
567
    for hit in sqs:
568
        description = None
569
        if hit.model_name == 'page' and hit.highlighted['text']:
570
            description = '<p>%s</p>' % hit.highlighted['text'][0]
571
        hits.append({
572
            'text': hit.title,
573
            'url': hit.url,
574
            'description': description,
575
        })
576

  
577
    return HttpResponse(json.dumps({'data': hits}), content_type='application/json')
578

  
579

  
580 552
def snapshot(request, *args, **kwargs):
581 553
    snapshot = PageSnapshot.objects.get(id=kwargs['pk'])
582 554
    return publish_page(request, snapshot.get_page())
combo/settings.py
76 76
    'combo.apps.calendar',
77 77
    'combo.apps.pwa',
78 78
    'combo.apps.gallery',
79
    'haystack',
80 79
    'xstatic.pkg.josefinsans',
81 80
    'xstatic.pkg.leaflet',
82 81
    'xstatic.pkg.opensans',
......
187 186
CKEDITOR_CONFIGS['small'] = copy.copy(CKEDITOR_CONFIGS['default'])
188 187
CKEDITOR_CONFIGS['small']['height'] = 150
189 188

  
190
HAYSTACK_CONNECTIONS = {
191
    'default': {
192
        'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine',
193
        'PATH': os.path.join(BASE_DIR, 'whoosh_index'),
194
    },
195
}
196

  
197 189
# from sorl.thumbnail -- https://sorl-thumbnail.readthedocs.io/en/latest/reference/settings.html
198 190
THUMBNAIL_PRESERVE_FORMAT = True
199 191
THUMBNAIL_FORCE_OVERWRITE = False
......
262 254

  
263 255
# search services
264 256
COMBO_SEARCH_SERVICES = {}
257
POSTGRESQL_FTS_SEARCH_CONFIG = 'french'
265 258

  
266 259
# mapping of payment modes
267 260
LINGO_NO_ONLINE_PAYMENT_REASONS = {}
debian/combo.cron.hourly
2 2

  
3 3
/sbin/runuser -u combo /usr/bin/combo-manage -- tenant_command cron --all-tenants
4 4
/sbin/runuser -u combo /usr/bin/combo-manage -- tenant_command clearsessions --all-tenants
5
# update_index cannot be used due to some bug in haystack/whoosh (#30509)
6
/sbin/runuser -u combo /usr/bin/combo-manage -- tenant_command rebuild_index --noinput --all-tenants -v0
debian/control
20 20
    python3-xstatic-opensans,
21 21
    python3-xstatic-roboto-fontface (>= 0.5.0.0),
22 22
    python3-eopayment (>= 1.35),
23
    python3-django-haystack (>= 2.4.0),
24 23
    python3-django-ratelimit,
25 24
    python3-sorl-thumbnail,
26 25
    python3-pil,
27 26
    python3-pywebpush,
28 27
    python3-pygal,
29 28
    python3-lxml
30
Recommends: python3-django-mellon, python3-whoosh
29
Recommends: python3-django-mellon
31 30
Conflicts: python-lingo
32 31
Breaks: combo (<< 2.34.post2)
33 32
Description: Portal Management System (Python module)
requirements.txt
11 11
eopayment>=1.13
12 12
python-dateutil
13 13
djangorestframework>=3.3, <3.7
14
django-haystack
15
whoosh
16 14
sorl-thumbnail
17 15
pyproj
setup.py
162 162
        'eopayment>=1.41',
163 163
        'python-dateutil',
164 164
        'djangorestframework>=3.3, <3.7',
165
        'django-haystack',
166 165
        'django-ratelimit<3',
167
        'whoosh',
168 166
        'sorl-thumbnail',
169 167
        'Pillow',
170 168
        'pyproj',
tests/settings.py
44 44
import tempfile
45 45
MEDIA_ROOT = tempfile.mkdtemp('combo-test')
46 46

  
47
HAYSTACK_CONNECTIONS['default']['PATH'] = os.path.join(
48
        tempfile.mkdtemp('combo-test-whoosh'))
49

  
50 47
if 'DISABLE_MIGRATIONS' in os.environ:
51 48
    class DisableMigrations(object):
52 49
        def __contains__(self, item):
tests/test_search.py
6 6
import mock
7 7

  
8 8
from django.conf import settings
9
from django.contrib.auth.models import AnonymousUser, User, Group
9 10
from django.test import override_settings
10 11
from django.test.client import RequestFactory
11 12
from django.core.management import call_command
12 13
from django.core.urlresolvers import reverse
13 14

  
14
from haystack.exceptions import SkipDocument
15

  
16 15
from combo.apps.search.engines import engines
17
from combo.apps.search.models import SearchCell
16
from combo.apps.search.models import SearchCell, IndexedCell
17
from combo.apps.search.utils import index_site, search_site
18 18
from combo.data.models import Page, JsonCell, TextCell, MenuCell, LinkCell
19
from combo.data.search_indexes import PageIndex
20 19

  
21 20
from .test_manager import login
22 21

  
......
229 228
    page = Page(title='example page', slug='example-page')
230 229
    page.save()
231 230

  
232
    # no indexation of private cells (is_visible check)
231
    # private cells are indexed
233 232
    cell = TextCell(page=page, text='foobar', public=False, order=0)
234
    assert cell.render_for_search() == ''
233
    assert cell.render_for_search().strip() == 'foobar'
235 234

  
236 235
    # no indexation of empty cells (is_relevant check)
237 236
    cell = TextCell(page=page, text='', order=0)
......
247 246

  
248 247
def test_search_contents_index():
249 248
    page = Page(title='example page', slug='example-page')
249
    page.public = True
250 250
    page.save()
251 251

  
252
    page_index = PageIndex()
253
    assert page_index.get_model() is Page
254

  
255
    assert page_index.prepare_url(page) == '/example-page/'
256

  
257
    page_index.prepare(page)
258

  
259
    page.public = False
260
    with pytest.raises(SkipDocument):
261
        page_index.prepare(page)
262

  
263
    page.public = True
264 252
    cell = TextCell(page=page, text='<p>foobar</p>', order=0)
265 253
    cell.save()
266 254

  
267
    prepared_data = page_index.prepare(page)
268
    assert 'foobar' in prepared_data['text']
255
    request = RequestFactory().get('/')
256
    request.user = AnonymousUser()
257
    hits = search_site(request, 'foobar')
258
    assert len(hits) == 0
259
    index_site()
260
    hits = search_site(request, 'foobar')
261
    assert len(hits) == 1
262

  
269 263

  
270 264
def test_search_contents_technical_placeholder():
271 265
    page = Page(title='example page', slug='example-page')
......
274 268
    TextCell(page=page, text='<p>foobar</p>', order=0, placeholder='_off').save()
275 269
    TextCell(page=page, text='<p>barfoo</p>', order=0, placeholder='on').save()
276 270

  
277
    page_index = PageIndex()
278
    prepared_data = page_index.prepare(page)
279
    assert 'barfoo' in prepared_data['text']
280
    assert not 'foobar' in prepared_data['text']
271
    request = RequestFactory().get('/')
272
    request.user = AnonymousUser()
273
    index_site()
274
    hits = search_site(request, 'foobar')
275
    assert len(hits) == 0
276
    hits = search_site(request, 'barfoo')
277
    assert len(hits) == 1
278

  
281 279

  
282 280
def test_search_api(app):
283 281
    page = Page(title='example page', slug='example-page')
......
291 289

  
292 290
    cell = TextCell(page=second_page, text='<p>other baz</p>', order=0)
293 291
    cell.save()
294

  
295
    page_index = PageIndex()
296
    page_index.reindex()
297

  
298
    resp = app.get('/api/search/?q=foobar', status=404)
292
    index_site()
299 293

  
300 294
    cell = SearchCell(page=page, _search_services={'data': ['_text']}, order=0)
301 295
    cell.save()
302 296

  
303
    resp = app.get('/api/search/?q=foobar', status=200)
304
    assert len(resp.json['data']) == 1
305
    assert resp.json['data'][0]['text'] == 'example page'
297
    resp = app.get('/ajax/search/%s/_text/?q=foobar' % cell.id, status=200)
298
    assert resp.text.count('<li') == 1
299
    assert 'example page' in resp.text
306 300

  
307
    resp = app.get('/api/search/?q=other', status=200)
308
    assert len(resp.json['data']) == 1
309
    assert resp.json['data'][0]['text'] == 'second page'
301
    resp = app.get('/ajax/search/%s/_text/?q=other' % cell.id, status=200)
302
    assert resp.text.count('<li') == 1
303
    assert 'second page' in resp.text
310 304

  
311
    resp = app.get('/api/search/?q=baz', status=200)
312
    assert len(resp.json['data']) == 2
305
    resp = app.get('/ajax/search/%s/_text/?q=baz' % cell.id, status=200)
306
    assert resp.text.count('<li') == 2
313 307

  
314
    resp = app.get('/api/search/?q=quux', status=200)
315
    assert len(resp.json['data']) == 0
308
    resp = app.get('/ajax/search/%s/_text/?q=quux' % cell.id, status=200)
309
    assert resp.text.count('<li') == 0
316 310

  
317
def test_update_index_command(app):
318
    call_command('clear_index', interactive=False)
319
    call_command('update_index') # empty site
320 311

  
312
def test_search_external_links(app):
321 313
    page = Page(title='example page', slug='example-page')
322 314
    page.save()
323 315

  
324 316
    cell = SearchCell(page=page, _search_services={'data': ['_text']}, order=0)
325 317
    cell.save()
326 318

  
327
    call_command('update_index')
328
    resp = app.get('/api/search/?q=foobar', status=200)
329
    assert len(resp.json['data']) == 0
319
    index_site()
320
    request = RequestFactory().get('/')
321
    request.user = AnonymousUser()
322
    hits = search_site(request, 'foobar')
323
    assert len(hits) == 0
330 324

  
331 325
    LinkCell(title='foobar', url='http://example.net', page=page, order=0).save()
332
    call_command('update_index')
326
    index_site()
333 327

  
334
    resp = app.get('/api/search/?q=foobar', status=200)
335
    assert len(resp.json['data']) == 1
336
    assert resp.json['data'][0]['text'] == 'foobar'
337
    assert resp.json['data'][0]['description'] is None
338
    assert resp.json['data'][0]['url'] == 'http://example.net'
328
    hits = search_site(request, 'foobar')
329
    assert len(hits) == 1
330
    assert hits[0]['text'] == 'foobar'
331
    assert hits[0]['url'] == 'http://example.net'
339 332

  
333
    # second link with same target
340 334
    LinkCell(title='baz', url='http://example.net', page=page, order=0).save()
341
    call_command('update_index')
342

  
343
    resp = app.get('/api/search/?q=baz', status=200)
344
    assert len(resp.json['data']) == 1
345
    assert resp.json['data'][0]['url'] == 'http://example.net'
335
    index_site()
346 336

  
347 337
    # add a second link with the same target
348
    LinkCell(title='bar', url='http://example.net', page=page, order=0).save()
349
    call_command('update_index')
338
    hits = search_site(request, 'baz')
339
    assert len(hits) == 1
340
    assert hits[0]['text'] in ('foobar', 'baz')
341
    assert hits[0]['url'] == 'http://example.net'
342
    hits = search_site(request, 'foobar')
343
    assert len(hits) == 1
344
    assert hits[0]['text'] in ('foobar', 'baz')
345
    assert hits[0]['url'] == 'http://example.net'
350 346

  
351
    resp = app.get('/api/search/?q=baz', status=200)
352
    assert len(resp.json['data']) == 1
353
    assert resp.json['data'][0]['url'] == 'http://example.net'
354

  
355
    resp = app.get('/api/search/?q=bar', status=200)
356
    assert len(resp.json['data']) == 1
357
    assert resp.json['data'][0]['url'] == 'http://example.net'
358 347

  
359 348
def test_manager_search_cell(app, admin_user):
360 349
    Page.objects.all().delete()
......
399 388

  
400 389

  
401 390
def test_manager_waiting_index_message(app, admin_user):
402
    from haystack import connections
403
    shutil.rmtree(connections['default'].get_backend().path)
404

  
405 391
    Page.objects.all().delete()
406 392
    page = Page(title='One', slug='one', template_name='standard')
407 393
    page.save()
......
417 403
    resp = resp.form.submit().follow()
418 404
    assert 'Content indexing has been scheduled' in resp.text
419 405

  
420
    os.mkdir(connections['default'].get_backend().path)
421
    call_command('update_index')
406
    index_site()
422 407
    resp = app.get('/manage/pages/%s/' % page.id)
423 408
    assert 'Content indexing has been scheduled' not in resp.text
424 409

  
......
455 440
        page.save()
456 441
        search_engines = engines.get_engines()
457 442
        assert 'users' in search_engines.keys()
443

  
444

  
445
def test_private_search(app):
446
    page = Page(title='example page', slug='example-page')
447
    page.save()
448

  
449
    TextCell(page=page, text='<p>foobar</p>', order=0, public=False).save()
450
    TextCell(page=page, text='<p>barfoo</p>', order=0, public=True).save()
451

  
452
    request = RequestFactory().get('/')
453
    request.user = AnonymousUser()
454
    index_site()
455
    hits = search_site(request, 'foobar')
456
    assert len(hits) == 0
457
    hits = search_site(request, 'barfoo')
458
    assert len(hits) == 1
459

  
460
    request.user = User.objects.create_user(username='normal-user')
461
    hits = search_site(request, 'foobar')
462
    assert len(hits) == 1
463
    hits = search_site(request, 'barfoo')
464
    assert len(hits) == 1
465

  
466

  
467
def test_restricted_search(app):
468
    group = Group(name='plop')
469
    group.save()
470

  
471
    page = Page(title='example page', slug='example-page')
472
    page.save()
473

  
474
    cell = TextCell(page=page, text='<p>foobar</p>', order=0, public=False)
475
    cell.save()
476
    cell.groups.set([group])
477
    TextCell(page=page, text='<p>barfoo</p>', order=0, public=False).save()
478
    index_site()
479

  
480
    # first cell is restricted, it's not found
481
    request = RequestFactory().get('/')
482
    request.user = User.objects.create_user(username='normal-user')
483
    hits = search_site(request, 'foobar')
484
    assert len(hits) == 0
485
    hits = search_site(request, 'barfoo')
486
    assert len(hits) == 1
487

  
488
    page.groups.set([group])
489
    index_site()
490

  
491
    # page is restricted, no cell is found
492
    hits = search_site(request, 'foobar')
493
    assert len(hits) == 0
494
    hits = search_site(request, 'barfoo')
495
    assert len(hits) == 0
496

  
497
    # user is in group, gets a result
498
    request.user.groups.set([group])
499
    hits = search_site(request, 'foobar')
500
    assert len(hits) == 1
501
    hits = search_site(request, 'barfoo')
502
    assert len(hits) == 1
503

  
504
    # cell is excluded from group view
505
    cell.restricted_to_unlogged = True
506
    cell.save()
507
    index_site()
508

  
509
    hits = search_site(request, 'foobar')
510
    assert len(hits) == 0
511
    hits = search_site(request, 'barfoo')
512
    assert len(hits) == 1
458
-