Merge pull request #7634 from readthedocs/search-subprojects

Search: allow to search on different versions of subprojects
readthedocs · Nov 19, 2020 · 5234f41 · 5234f41
2 parents aa92a07 + 2f4af17
commit 5234f41
Show file tree

Hide file tree

Showing 4 changed files with 152 additions and 29 deletions.
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
@@ -1568,8 +1568,14 @@ def add_features(sender, **kwargs):
     SKIP_SYNC_VERSIONS = 'skip_sync_versions'
     CACHED_ENVIRONMENT = 'cached_environment'
     LIMIT_CONCURRENT_BUILDS = 'limit_concurrent_builds'
+
+    # Search related features
     DISABLE_SERVER_SIDE_SEARCH = 'disable_server_side_search'
     ENABLE_MKDOCS_SERVER_SIDE_SEARCH = 'enable_mkdocs_server_side_search'
+    DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
+    INDEX_FROM_HTML_FILES = 'index_from_html_files'
+    SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION = 'search_subprojects_on_default_version'
+
     FORCE_SPHINX_FROM_VENV = 'force_sphinx_from_venv'
     LIST_PACKAGES_INSTALLED_ENV = 'list_packages_installed_env'
     VCS_REMOTE_LISTING = 'vcs_remote_listing'
@@ -1578,8 +1584,6 @@ def add_features(sender, **kwargs):
     USE_SPHINX_BUILDERS = 'use_sphinx_builders'
     DEDUPLICATE_BUILDS = 'deduplicate_builds'
     USE_SPHINX_RTD_EXT_LATEST = 'rtd_sphinx_ext_latest'
-    DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
-    INDEX_FROM_HTML_FILES = 'index_from_html_files'
     DONT_CREATE_INDEX = 'dont_create_index'
     USE_NEW_PIP_RESOLVER = 'use_new_pip_resolver'
     DONT_INSTALL_LATEST_PIP = 'dont_install_latest_pip'
@@ -1667,6 +1671,8 @@ def add_features(sender, **kwargs):
             LIMIT_CONCURRENT_BUILDS,
             _('Limit the amount of concurrent builds'),
         ),
+
+        # Search related features.
         (
             DISABLE_SERVER_SIDE_SEARCH,
             _('Disable server side search'),
@@ -1675,6 +1681,22 @@ def add_features(sender, **kwargs):
             ENABLE_MKDOCS_SERVER_SIDE_SEARCH,
             _('Enable server side search for MkDocs projects'),
         ),
+        (
+            DEFAULT_TO_FUZZY_SEARCH,
+            _('Default to fuzzy search for simple search queries'),
+        ),
+        (
+            INDEX_FROM_HTML_FILES,
+            _('Index content directly from html files instead or relying in other sources'),
+        ),
+        (
+            SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION,
+            _(
+                'When searching subprojects default to its default version if it doesn\'t '
+                'have the same version as the main project'
+            ),
+        ),
+
         (
             FORCE_SPHINX_FROM_VENV,
             _('Force to use Sphinx from the current virtual environment'),
@@ -1710,14 +1732,6 @@ def add_features(sender, **kwargs):
             USE_SPHINX_RTD_EXT_LATEST,
             _('Use latest version of the Read the Docs Sphinx extension'),
         ),
-        (
-            DEFAULT_TO_FUZZY_SEARCH,
-            _('Default to fuzzy search for simple search queries'),
-        ),
-        (
-            INDEX_FROM_HTML_FILES,
-            _('Index content directly from html files instead or relying in other sources'),
-        ),
         (
             DONT_CREATE_INDEX,
             _('Do not create index.md or README.rst if the project does not have one.'),

diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py
@@ -227,14 +227,26 @@ def _get_all_projects_data(self):
         subprojects = Project.objects.filter(
             superprojects__parent_id=main_project.id,
         )
-        for project in subprojects:
+        for subproject in subprojects:
             version = self._get_subproject_version(
                 version_slug=main_version.slug,
-                subproject=project,
+                subproject=subproject,
             )
+
+            # Fall back to the default version of the subproject.
+            if (
+                not version
+                and main_project.has_feature(Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION)
+                and subproject.default_version
+            ):
+                version = self._get_subproject_version(
+                    version_slug=subproject.default_version,
+                    subproject=subproject,
+                )
+
             if version and self._has_permission(self.request.user, version):
-                url = project.get_docs_url(version_slug=version.slug)
-                projects_data[project.slug] = VersionData(
+                url = subproject.get_docs_url(version_slug=version.slug)
+                projects_data[subproject.slug] = VersionData(
                     slug=version.slug,
                     doctype=version.documentation_type,
                     docs_url=url,
@@ -290,26 +302,40 @@ def get_queryset(self):
            calling ``search.execute().hits``. This is why an DSL search object
            is compatible with DRF's paginator.
         """
+        main_project = self._get_project()
+        main_version = self._get_version()
+        projects = {}
         filters = {}
-        filters['project'] = list(self._get_all_projects_data().keys())
-        filters['version'] = self._get_version().slug
 
-        # Check to avoid searching all projects in case these filters are empty.
-        if not filters['project']:
-            log.info('Unable to find a project to search')
-            return []
-        if not filters['version']:
-            log.info('Unable to find a version to search')
-            return []
+        if main_project.has_feature(Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION):
+            projects = {
+                project: version.slug
+                for project, version in self._get_all_projects_data().items()
+            }
+            # Check to avoid searching all projects in case it's empty.
+            if not projects:
+                log.info('Unable to find a version to search')
+                return []
+        else:
+            filters['project'] = list(self._get_all_projects_data().keys())
+            filters['version'] = main_version.slug
+            # Check to avoid searching all projects in case these filters are empty.
+            if not filters['project']:
+                log.info('Unable to find a project to search')
+                return []
+            if not filters['version']:
+                log.info('Unable to find a version to search')
+                return []
 
         query = self.request.query_params['q']
         queryset = PageSearch(
             query=query,
+            projects=projects,
             filters=filters,
             user=self.request.user,
             # We use a permission class to control authorization
             filter_by_user=False,
-            use_advanced_query=not self._get_project().has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
+            use_advanced_query=not main_project.has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
         )
         return queryset
 

diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py
@@ -11,6 +11,7 @@
     MultiMatch,
     Nested,
     SimpleQueryString,
+    Term,
     Wildcard,
 )
 
@@ -38,12 +39,23 @@ class RTDFacetedSearch(FacetedSearch):
         'post_tags': ['</span>'],
     }
 
-    def __init__(self, query=None, filters=None, user=None, use_advanced_query=True, **kwargs):
+    def __init__(
+            self,
+            query=None,
+            filters=None,
+            projects=None,
+            user=None,
+            use_advanced_query=True,
+            **kwargs,
+    ):
         """
         Pass in a user in order to filter search results by privacy.
 
-        If `use_advanced_query` is `True`,
-        force to always use `SimpleQueryString` for the text query object.
+        :param projects: A dictionary of project slugs mapped to the version slug
+        to search in each project. Results are filtered with these values.
+
+        :param use_advanced_query: If `True`, always use
+        `SimpleQueryString` for the text query object.
 
         .. warning::
 
@@ -53,6 +65,7 @@ def __init__(self, query=None, filters=None, user=None, use_advanced_query=True,
         self.user = user
         self.filter_by_user = kwargs.pop('filter_by_user', True)
         self.use_advanced_query = use_advanced_query
+        self.projects = projects or {}
 
         # Hack a fix to our broken connection pooling
         # This creates a new connection on every request,
@@ -265,7 +278,12 @@ def total_count(self):
         return s.hits.total
 
     def query(self, search, query):
-        """Manipulates the query to support nested queries and a custom rank for pages."""
+        """
+        Manipulates the query to support nested queries and a custom rank for pages.
+
+        If `self.projects` was given, we use it to filter the documents that
+        match the same project and version.
+        """
         search = search.highlight_options(**self._highlight_options)
         search = search.source(excludes=self.excludes)
 
@@ -287,8 +305,22 @@ def query(self, search, query):
         )
 
         queries.extend([sections_nested_query, domains_nested_query])
+        bool_query = Bool(should=queries)
+
+        if self.projects:
+            versions_query = [
+                Bool(
+                    must=[
+                        Term(project={'value': project}),
+                        Term(version={'value': version}),
+                    ]
+                )
+                for project, version in self.projects.items()
+            ]
+            bool_query = Bool(must=[bool_query, Bool(should=versions_query)])
+
         final_query = FunctionScore(
-            query=Bool(should=queries),
+            query=bool_query,
             script_score=self._get_script_score(),
         )
         search = search.query(final_query)

diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
@@ -264,11 +264,62 @@ def test_doc_search_subprojects(self, api_client, all_projects):
         # First result should be the subproject
         first_result = data[0]
         assert first_result['project'] == subproject.slug
+        # The result is from the same version as the main project.
+        assert first_result['version'] == version.slug
         # Check the link is the subproject document link
         document_link = subproject.get_docs_url(version_slug=version.slug)
         link = first_result['domain'] + first_result['path']
         assert document_link in link
 
+    def test_doc_search_subprojects_default_version(self, api_client, all_projects):
+        """Return results from subprojects that match the version from the main project or fall back to their default version."""
+        project = all_projects[0]
+        version = project.versions.all()[0]
+        feature, _ = Feature.objects.get_or_create(
+            feature_id=Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION,
+        )
+        project.feature_set.add(feature)
+
+        subproject = all_projects[1]
+        subproject_version = subproject.versions.all()[0]
+
+        # Change the name of the version, and make it default.
+        subproject_version.slug = 'different'
+        subproject_version.save()
+        subproject.default_version = subproject_version.slug
+        subproject.save()
+        subproject.versions.filter(slug=version.slug).delete()
+
+        # Refresh index
+        version_files = HTMLFile.objects.all().filter(version=subproject_version)
+        for f in version_files:
+            PageDocument().update(f)
+
+        # Add another project as subproject of the project
+        project.add_subproject(subproject)
+
+        # Now search with subproject content but explicitly filter by the parent project
+        query = get_search_query_from_project_file(project_slug=subproject.slug)
+        search_params = {
+            'q': query,
+            'project': project.slug,
+            'version': version.slug
+        }
+        resp = self.get_search(api_client, search_params)
+        assert resp.status_code == 200
+
+        data = resp.data['results']
+        assert len(data) >= 1  # there may be results from other projects
+
+        # First result should be the subproject
+        first_result = data[0]
+        assert first_result['project'] == subproject.slug
+        assert first_result['version'] == 'different'
+        # Check the link is the subproject document link
+        document_link = subproject.get_docs_url(version_slug=subproject_version.slug)
+        link = first_result['domain'] + first_result['path']
+        assert document_link in link
+
     def test_doc_search_unexisting_project(self, api_client):
         project = 'notfound'
         assert not Project.objects.filter(slug=project).exists()