Skip to content

Commit

Permalink
Merge pull request #7634 from readthedocs/search-subprojects
Browse files Browse the repository at this point in the history
Search: allow to search on different versions of subprojects
  • Loading branch information
stsewd authored Nov 19, 2020
2 parents aa92a07 + 2f4af17 commit 5234f41
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 29 deletions.
34 changes: 24 additions & 10 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1568,8 +1568,14 @@ def add_features(sender, **kwargs):
SKIP_SYNC_VERSIONS = 'skip_sync_versions'
CACHED_ENVIRONMENT = 'cached_environment'
LIMIT_CONCURRENT_BUILDS = 'limit_concurrent_builds'

# Search related features
DISABLE_SERVER_SIDE_SEARCH = 'disable_server_side_search'
ENABLE_MKDOCS_SERVER_SIDE_SEARCH = 'enable_mkdocs_server_side_search'
DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
INDEX_FROM_HTML_FILES = 'index_from_html_files'
SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION = 'search_subprojects_on_default_version'

FORCE_SPHINX_FROM_VENV = 'force_sphinx_from_venv'
LIST_PACKAGES_INSTALLED_ENV = 'list_packages_installed_env'
VCS_REMOTE_LISTING = 'vcs_remote_listing'
Expand All @@ -1578,8 +1584,6 @@ def add_features(sender, **kwargs):
USE_SPHINX_BUILDERS = 'use_sphinx_builders'
DEDUPLICATE_BUILDS = 'deduplicate_builds'
USE_SPHINX_RTD_EXT_LATEST = 'rtd_sphinx_ext_latest'
DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
INDEX_FROM_HTML_FILES = 'index_from_html_files'
DONT_CREATE_INDEX = 'dont_create_index'
USE_NEW_PIP_RESOLVER = 'use_new_pip_resolver'
DONT_INSTALL_LATEST_PIP = 'dont_install_latest_pip'
Expand Down Expand Up @@ -1667,6 +1671,8 @@ def add_features(sender, **kwargs):
LIMIT_CONCURRENT_BUILDS,
_('Limit the amount of concurrent builds'),
),

# Search related features.
(
DISABLE_SERVER_SIDE_SEARCH,
_('Disable server side search'),
Expand All @@ -1675,6 +1681,22 @@ def add_features(sender, **kwargs):
ENABLE_MKDOCS_SERVER_SIDE_SEARCH,
_('Enable server side search for MkDocs projects'),
),
(
DEFAULT_TO_FUZZY_SEARCH,
_('Default to fuzzy search for simple search queries'),
),
(
INDEX_FROM_HTML_FILES,
_('Index content directly from html files instead or relying in other sources'),
),
(
SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION,
_(
'When searching subprojects default to its default version if it doesn\'t '
'have the same version as the main project'
),
),

(
FORCE_SPHINX_FROM_VENV,
_('Force to use Sphinx from the current virtual environment'),
Expand Down Expand Up @@ -1710,14 +1732,6 @@ def add_features(sender, **kwargs):
USE_SPHINX_RTD_EXT_LATEST,
_('Use latest version of the Read the Docs Sphinx extension'),
),
(
DEFAULT_TO_FUZZY_SEARCH,
_('Default to fuzzy search for simple search queries'),
),
(
INDEX_FROM_HTML_FILES,
_('Index content directly from html files instead or relying in other sources'),
),
(
DONT_CREATE_INDEX,
_('Do not create index.md or README.rst if the project does not have one.'),
Expand Down
54 changes: 40 additions & 14 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,26 @@ def _get_all_projects_data(self):
subprojects = Project.objects.filter(
superprojects__parent_id=main_project.id,
)
for project in subprojects:
for subproject in subprojects:
version = self._get_subproject_version(
version_slug=main_version.slug,
subproject=project,
subproject=subproject,
)

# Fallback to the default version of the subproject.
if (
not version
and main_project.has_feature(Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION)
and subproject.default_version
):
version = self._get_subproject_version(
version_slug=subproject.default_version,
subproject=subproject,
)

if version and self._has_permission(self.request.user, version):
url = project.get_docs_url(version_slug=version.slug)
projects_data[project.slug] = VersionData(
url = subproject.get_docs_url(version_slug=version.slug)
projects_data[subproject.slug] = VersionData(
slug=version.slug,
doctype=version.documentation_type,
docs_url=url,
Expand Down Expand Up @@ -290,26 +302,40 @@ def get_queryset(self):
calling ``search.execute().hits``. This is why an DSL search object
is compatible with DRF's paginator.
"""
main_project = self._get_project()
main_version = self._get_version()
projects = {}
filters = {}
filters['project'] = list(self._get_all_projects_data().keys())
filters['version'] = self._get_version().slug

# Check to avoid searching all projects in case these filters are empty.
if not filters['project']:
log.info('Unable to find a project to search')
return []
if not filters['version']:
log.info('Unable to find a version to search')
return []
if main_project.has_feature(Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION):
projects = {
project: version.slug
for project, version in self._get_all_projects_data().items()
}
# Check to avoid searching all projects in case it's empty.
if not projects:
log.info('Unable to find a version to search')
return []
else:
filters['project'] = list(self._get_all_projects_data().keys())
filters['version'] = main_version.slug
# Check to avoid searching all projects in case these filters are empty.
if not filters['project']:
log.info('Unable to find a project to search')
return []
if not filters['version']:
log.info('Unable to find a version to search')
return []

query = self.request.query_params['q']
queryset = PageSearch(
query=query,
projects=projects,
filters=filters,
user=self.request.user,
# We use a permission class to control authorization
filter_by_user=False,
use_advanced_query=not self._get_project().has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
use_advanced_query=not main_project.has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
)
return queryset

Expand Down
42 changes: 37 additions & 5 deletions readthedocs/search/faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
MultiMatch,
Nested,
SimpleQueryString,
Term,
Wildcard,
)

Expand Down Expand Up @@ -38,12 +39,23 @@ class RTDFacetedSearch(FacetedSearch):
'post_tags': ['</span>'],
}

def __init__(self, query=None, filters=None, user=None, use_advanced_query=True, **kwargs):
def __init__(
self,
query=None,
filters=None,
projects=None,
user=None,
use_advanced_query=True,
**kwargs,
):
"""
Pass in a user in order to filter search results by privacy.
If `use_advanced_query` is `True`,
force to always use `SimpleQueryString` for the text query object.
:param projects: A dictionary of project slugs mapped to a `VersionData` object.
Results are filter with these values.
:param use_advanced_query: If `True` forces to always use
`SimpleQueryString` for the text query object.
.. warning::
Expand All @@ -53,6 +65,7 @@ def __init__(self, query=None, filters=None, user=None, use_advanced_query=True,
self.user = user
self.filter_by_user = kwargs.pop('filter_by_user', True)
self.use_advanced_query = use_advanced_query
self.projects = projects or {}

# Hack a fix to our broken connection pooling
# This creates a new connection on every request,
Expand Down Expand Up @@ -265,7 +278,12 @@ def total_count(self):
return s.hits.total

def query(self, search, query):
"""Manipulates the query to support nested queries and a custom rank for pages."""
"""
Manipulates the query to support nested queries and a custom rank for pages.
If `self.projects` was given, we use it to filter the documents that
match the same project and version.
"""
search = search.highlight_options(**self._highlight_options)
search = search.source(excludes=self.excludes)

Expand All @@ -287,8 +305,22 @@ def query(self, search, query):
)

queries.extend([sections_nested_query, domains_nested_query])
bool_query = Bool(should=queries)

if self.projects:
versions_query = [
Bool(
must=[
Term(project={'value': project}),
Term(version={'value': version}),
]
)
for project, version in self.projects.items()
]
bool_query = Bool(must=[bool_query, Bool(should=versions_query)])

final_query = FunctionScore(
query=Bool(should=queries),
query=bool_query,
script_score=self._get_script_score(),
)
search = search.query(final_query)
Expand Down
51 changes: 51 additions & 0 deletions readthedocs/search/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,62 @@ def test_doc_search_subprojects(self, api_client, all_projects):
# First result should be the subproject
first_result = data[0]
assert first_result['project'] == subproject.slug
# The result is from the same version as the main project.
assert first_result['version'] == version.slug
# Check the link is the subproject document link
document_link = subproject.get_docs_url(version_slug=version.slug)
link = first_result['domain'] + first_result['path']
assert document_link in link

def test_doc_search_subprojects_default_version(self, api_client, all_projects):
"""Return results from subprojects that match the version from the main project or fallback to its default version."""
project = all_projects[0]
version = project.versions.all()[0]
feature, _ = Feature.objects.get_or_create(
feature_id=Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION,
)
project.feature_set.add(feature)

subproject = all_projects[1]
subproject_version = subproject.versions.all()[0]

# Change the name of the version, and make it default.
subproject_version.slug = 'different'
subproject_version.save()
subproject.default_version = subproject_version.slug
subproject.save()
subproject.versions.filter(slug=version.slug).delete()

# Refresh index
version_files = HTMLFile.objects.all().filter(version=subproject_version)
for f in version_files:
PageDocument().update(f)

# Add another project as subproject of the project
project.add_subproject(subproject)

# Now search with subproject content but explicitly filter by the parent project
query = get_search_query_from_project_file(project_slug=subproject.slug)
search_params = {
'q': query,
'project': project.slug,
'version': version.slug
}
resp = self.get_search(api_client, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) >= 1 # there may be results from another projects

# First result should be the subproject
first_result = data[0]
assert first_result['project'] == subproject.slug
assert first_result['version'] == 'different'
# Check the link is the subproject document link
document_link = subproject.get_docs_url(version_slug=subproject_version.slug)
link = first_result['domain'] + first_result['path']
assert document_link in link

def test_doc_search_unexisting_project(self, api_client):
project = 'notfound'
assert not Project.objects.filter(slug=project).exists()
Expand Down

0 comments on commit 5234f41

Please sign in to comment.