diff --git a/readthedocs/builds/constants.py b/readthedocs/builds/constants.py index df1bd6bf7f3..694e3ffa010 100644 --- a/readthedocs/builds/constants.py +++ b/readthedocs/builds/constants.py @@ -156,6 +156,7 @@ # All artifact types supported by Read the Docs. # They match the output directory (`_readthedocs/`) +# TODO: Remove this and use constants.MEDIA_TYPES instead ARTIFACT_TYPES = ( "html", "json", @@ -177,3 +178,11 @@ "epub", "pdf", ) + +# Part of Feature: ENABLE_MULTIPLE_PDFS +# Should replace ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT +# Currently hidden behind a feature flag +ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF = ( + "htmlzip", + "epub", +) diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py index da57c40d244..a8dbe6f389b 100644 --- a/readthedocs/builds/storage.py +++ b/readthedocs/builds/storage.py @@ -132,13 +132,13 @@ def _check_suspicious_path(self, path): def _rclone(self): raise NotImplementedError - def rclone_sync_directory(self, source, destination): + def rclone_sync_directory(self, source, destination, **sync_kwargs): """Sync a directory recursively to storage using rclone sync.""" if destination in ("", "/"): raise SuspiciousFileOperation("Syncing all storage cannot be right") self._check_suspicious_path(source) - return self._rclone.sync(source, destination) + return self._rclone.sync(source, destination, **sync_kwargs) def join(self, directory, filepath): return safe_join(directory, filepath) diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index afa1c253421..dc56c436289 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -6,6 +6,7 @@ import itertools import os +import shutil from glob import glob from pathlib import Path @@ -86,6 +87,14 @@ def __init__(self, *args, **kwargs): ), self.relative_output_dir, ) + + # Isolate temporary files in the _readthedocs/ folder + # Used for new feature ENABLE_MULTIPLE_PDFS + self.absolute_host_tmp_root = os.path.join( + self.project.checkout_path(self.version.slug), + "_readthedocs/tmp", + ) + self.absolute_container_output_dir = os.path.join( "$READTHEDOCS_OUTPUT", self.relative_output_dir ) @@ -389,6 +398,7 @@ class LocalMediaBuilder(BaseSphinx): def _post_build(self): """Internal post build to create the ZIP file from the HTML output.""" + target_file = os.path.join( self.absolute_container_output_dir, # TODO: shouldn't this name include the name of the version as well? 
@@ -435,23 +445,24 @@ class EpubBuilder(BaseSphinx):
     relative_output_dir = "epub"

     def _post_build(self):
-        """Internal post build to cleanup EPUB output directory and leave only one .epub file."""
+        """Internal post build to clean up the EPUB output directory and leave only one .epub file."""
         temp_epub_file = f"/tmp/{self.project.slug}-{self.version.slug}.epub"
         target_file = os.path.join(
             self.absolute_container_output_dir,
             f"{self.project.slug}.epub",
         )

-        epub_sphinx_filepaths = glob(
-            os.path.join(self.absolute_host_output_dir, "*.epub")
-        )
+        epub_sphinx_filepaths = self._get_epub_files_generated()
         if epub_sphinx_filepaths:
             # NOTE: we currently support only one .epub per version
             epub_filepath = epub_sphinx_filepaths[0]

+            # Move out the epub file
             self.run(
                 "mv", epub_filepath, temp_epub_file, cwd=self.project_path, record=False
             )
+
+            # Delete everything recursively
             self.run(
                 "rm",
                 "--recursive",
@@ -459,6 +470,8 @@ def _post_build(self):
                 cwd=self.project_path,
                 record=False,
             )
+
+            # Create everything again
             self.run(
                 "mkdir",
                 "--parents",
@@ -466,10 +479,20 @@ def _post_build(self):
                 cwd=self.project_path,
                 record=False,
             )
+
+            # Restore the epub file
             self.run(
                 "mv", temp_epub_file, target_file, cwd=self.project_path, record=False
             )

+    def _get_epub_files_generated(self):
+        """
+        Return a list of *.epub files generated by the Sphinx build command.
+
+        This is split out into a separate method to be test-friendly.
+        """
+        return glob(os.path.join(self.absolute_host_output_dir, "*.epub"))
+

 class LatexBuildCommand(BuildCommand):
@@ -501,9 +524,12 @@ class PdfBuilder(BaseSphinx):
     relative_output_dir = "pdf"
     sphinx_builder = "latex"
+
+    # TODO: Remove this when dissolving Feature.ENABLE_MULTIPLE_PDFS
     pdf_file_name = None

     def build(self):
+        """Run Sphinx to convert the docs to LaTeX, then use latexmk to build the PDFs."""
         self.run(
             *self.get_sphinx_cmd(),
             "-T",
@@ -516,7 +542,7 @@ def build(self):
             f"language={self.project.language}",
             # Sphinx's source directory (SOURCEDIR).
             # We are executing this command at the location of the `conf.py` file (CWD).
-            # TODO: ideally we should execute it from where the repository was clonned,
+            # TODO: ideally we should execute it from where the repository was cloned,
             # but that could lead unexpected behavior to some users:
             # https://github.com/readthedocs/readthedocs.org/pull/9888#issuecomment-1384649346
             ".",
@@ -526,16 +552,28 @@ def build(self):
             bin_path=self.python_env.venv_bin(),
         )

-        tex_files = glob(os.path.join(self.absolute_host_output_dir, "*.tex"))
-        if not tex_files:
-            raise BuildUserError("No TeX files were found.")
+        # Check that we generated .tex files.
+        self._check_for_files("tex")

         # Run LaTeX -> PDF conversions
         success = self._build_latexmk(self.project_path)

-        self._post_build()
+        if self.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS):
+            self._post_build_multiple()
+        else:
+            self._post_build()
         return success

+    def _check_for_files(self, extension):
+        """
+        Check that at least one ``*.<extension>`` file exists in the output dir.
+
+        This method mostly exists so we have a pattern that is test-friendly (can be mocked).
+        """
+        found_files = glob(os.path.join(self.absolute_host_output_dir, f"*.{extension}"))
+        if not found_files:
+            raise BuildUserError(f"No *.{extension} files were found.")
+
     def _build_latexmk(self, cwd):
         # These steps are copied from the Makefile generated by Sphinx >= 1.6
         # https://github.com/sphinx-doc/sphinx/blob/master/sphinx/texinputs/Makefile_t
@@ -585,13 +623,19 @@ def _build_latexmk(self, cwd):
         # When ``-f`` is used, latexmk will continue building if it
         # encounters errors. We still receive a failure exit code in this
         # case, but the correct steps should run.
-            '-f',
-            '-dvi-',
-            '-ps-',
-            f'-jobname={self.project.slug}',
-            '-interaction=nonstopmode',
+            "-f",
+            "-dvi-",
+            "-ps-",
+            "-interaction=nonstopmode",
         ]

+        if self.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS):
+            cmd.append(f"-jobname={self.project.slug}_%A")
+        else:
+            cmd.append(
+                f"-jobname={self.project.slug}",
+            )
+
         cmd_ret = self.build_env.run_command_class(
             cls=latex_class,
             cmd=cmd,
@@ -599,10 +643,24 @@ def _build_latexmk(self, cwd):
             cwd=self.absolute_host_output_dir,
         )

-        self.pdf_file_name = f"{self.project.slug}.pdf"
+        if self.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS):
+            pdf_files = glob(os.path.join(self.absolute_host_output_dir, "*.pdf"))
+
+            # If there is only one PDF file, rename it to <project_slug>.pdf
+            # to preserve the old behavior.
+            if len(pdf_files) == 1:
+                os.rename(
+                    pdf_files[0],
+                    os.path.join(
+                        self.absolute_host_output_dir, f"{self.project.slug}.pdf"
+                    ),
+                )
+        else:
+            self.pdf_file_name = f"{self.project.slug}.pdf"

         return cmd_ret.successful

+    # Superseded by _post_build_multiple when Feature.ENABLE_MULTIPLE_PDFS is enabled
     def _post_build(self):
         """Internal post build to cleanup PDF output directory and leave only one .pdf file."""
@@ -649,3 +707,61 @@ def _post_build(self):
         self.run(
             "mv", temp_pdf_file, target_file, cwd=self.project_path, record=False
         )
+
+    # Introduced by Feature.ENABLE_MULTIPLE_PDFS
+    def _post_build_multiple(self):
+        """Internal post build to clean up the PDF output directory and leave only .pdf files."""
+
+        pdf_files = self._pdf_files_generated()
+        if not pdf_files:
+            raise PDFNotFound()
+
+        if not os.path.exists(self.absolute_host_tmp_root):
+            os.makedirs(self.absolute_host_tmp_root)
+
+        # We cannot use '*' in commands sent to the host machine; the asterisk gets escaped.
+        # So we opt for iterating from outside the container.
+        pdf_file_names = []
+        for fname in pdf_files:
+            shutil.move(
+                fname,
+                os.path.join(self.absolute_host_tmp_root, os.path.basename(fname)),
+            )
+            pdf_file_names.append(os.path.basename(fname))
+
+        # Delete everything from the output dir
+        self.run(
+            "rm",
+            "-r",
+            self.absolute_host_output_dir,
+            cwd=self.project_path,
+            record=False,
+        )
+
+        # Recreate the output dir
+        self.run(
+            "mkdir",
+            "-p",
+            self.absolute_host_output_dir,
+            cwd=self.project_path,
+            record=False,
+        )
+
+        # Move the PDFs back.
+        # Note: We cannot use '*' in commands sent to the host machine; the asterisk gets escaped.
+        for basename in pdf_file_names:
+            self.run(
+                "mv",
+                os.path.join(self.absolute_host_tmp_root, basename),
+                self.absolute_host_output_dir,
+                cwd=self.project_path,
+                record=False,
+            )
+
+    def _pdf_files_generated(self):
+        """
+        Return a list of PDF files generated by the build command.
+
+        This method is mainly here to be test- and mock-friendly.
+        """
+        return glob(os.path.join(self.absolute_host_output_dir, "*.pdf"))
diff --git a/readthedocs/projects/constants.py b/readthedocs/projects/constants.py
index 4cae14dd265..e70c7a484d1 100644
--- a/readthedocs/projects/constants.py
+++ b/readthedocs/projects/constants.py
@@ -47,6 +47,15 @@
     MEDIA_TYPE_JSON,
 )

+# Map media types to their known extensions.
+# This is used for validating and uploading artifacts.
+MEDIA_TYPES_EXTENSIONS = { + MEDIA_TYPE_PDF: ("pdf",), + MEDIA_TYPE_EPUB: ("epub",), + MEDIA_TYPE_HTMLZIP: ("zip",), + MEDIA_TYPE_JSON: ("json",), +} + BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/" BUILD_COMMANDS_OUTPUT_PATH_HTML = os.path.join(BUILD_COMMANDS_OUTPUT_PATH, "html") diff --git a/readthedocs/projects/managers.py b/readthedocs/projects/managers.py index 995ea3da4f2..54542274913 100644 --- a/readthedocs/projects/managers.py +++ b/readthedocs/projects/managers.py @@ -1,7 +1,9 @@ from django.db import models +from .querysets import ImportedFileQuerySet + class HTMLFileManager(models.Manager): def get_queryset(self): - return super().get_queryset().filter(name__endswith='.html') + return ImportedFileQuerySet(self.model, using=self._db).html_files() diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 42287cadc61..3a5d999c13d 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -38,6 +38,7 @@ from readthedocs.projects.querysets import ( ChildRelatedProjectQuerySet, FeatureQuerySet, + ImportedFileQuerySet, ProjectQuerySet, RelatedProjectQuerySet, ) @@ -56,11 +57,13 @@ from readthedocs.vcs_support.backends import backend_cls from .constants import ( + BUILD_COMMANDS_OUTPUT_PATH, DOWNLOADABLE_MEDIA_TYPES, MEDIA_TYPE_EPUB, MEDIA_TYPE_HTMLZIP, MEDIA_TYPE_PDF, MEDIA_TYPES, + MEDIA_TYPES_EXTENSIONS, ) log = structlog.get_logger(__name__) @@ -914,10 +917,12 @@ def artifact_path(self, type_, version=LATEST): """ The path to the build docs output for the project. - :param type_: one of `html`, `json`, `htmlzip`, `pdf`, `epub`. + :param type_: A type listed in constants.MEDIA_TYPES :param version: slug of the version. """ - return os.path.join(self.checkout_path(version=version), "_readthedocs", type_) + return os.path.join( + self.checkout_path(version=version), BUILD_COMMANDS_OUTPUT_PATH, type_ + ) def conf_file(self, version=LATEST): """Find a Sphinx ``conf.py`` file in the project checkout.""" @@ -1482,7 +1487,11 @@ class ImportedFile(models.Model): # max_length is set to 4096 because linux has a maximum path length # of 4096 characters for most filesystems (including EXT4). 
# https://github.com/rtfd/readthedocs.org/issues/5061
-    path = models.CharField(_('Path'), max_length=4096)
+    # The path of the file is relative to Project.artifact_path(type, version)
+    path = models.CharField(
+        _("Path"),
+        max_length=4096,
+    )
     commit = models.CharField(_('Commit'), max_length=255)
     build = models.IntegerField(_('Build id'), null=True)
     modified_date = models.DateTimeField(_('Modified date'), auto_now=True)
@@ -1498,6 +1507,8 @@ class ImportedFile(models.Model):
         null=True,
     )

+    objects = ImportedFileQuerySet.as_manager()
+
     def get_absolute_url(self):
         return resolve(
             project=self.project,
@@ -1507,6 +1518,16 @@ def get_absolute_url(self):
             external=False,
         )

+    def get_media_type(self):
+        """Return the matching media type constant from constants.MEDIA_TYPES, if any."""
+        __, extension = os.path.splitext(self.name)
+        if not extension:
+            return
+        extension = extension.lstrip(".")
+        for _type, valid_extensions in MEDIA_TYPES_EXTENSIONS.items():
+            if extension in valid_extensions:
+                return _type
+
     def __str__(self):
         return '{}: {}'.format(self.name, self.project)
@@ -1940,6 +1961,7 @@ def add_features(sender, **kwargs):
     DONT_CREATE_INDEX = "dont_create_index"
     HOSTING_INTEGRATIONS = "hosting_integrations"
     NO_CONFIG_FILE_DEPRECATED = "no_config_file"
+    ENABLE_MULTIPLE_PDFS = "multiple_pdfs"

     FEATURES = (
         (
@@ -2078,6 +2100,7 @@ def add_features(sender, **kwargs):
             NO_CONFIG_FILE_DEPRECATED,
             _("Build: Building without a configuration file is deprecated."),
         ),
+        (ENABLE_MULTIPLE_PDFS, _("Build: Enable multiple PDF support during builds.")),
     )

     FEATURES = sorted(FEATURES, key=lambda l: l[1])
diff --git a/readthedocs/projects/querysets.py b/readthedocs/projects/querysets.py
index c64e772a97a..c97914d25fc 100644
--- a/readthedocs/projects/querysets.py
+++ b/readthedocs/projects/querysets.py
@@ -233,3 +233,17 @@ def for_project(self, project):
             Q(default_true=True, add_date__gt=project.pub_date)
             | Q(future_default_true=True, add_date__lte=project.pub_date)
         ).distinct()
+
+
+class ImportedFileQuerySet(models.QuerySet):
+    def html_files(self):
+        return self.filter(name__endswith=".html")
+
+    def pdf_files(self):
+        return self.filter(name__endswith=".pdf")
+
+    def htmlzip_files(self):
+        return self.filter(name__endswith=".zip")
+
+    def epub_files(self):
+        return self.filter(name__endswith=".epub")
diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py
index 70fdfb4114d..ba59f322789 100644
--- a/readthedocs/projects/tasks/builds.py
+++ b/readthedocs/projects/tasks/builds.py
@@ -21,6 +21,7 @@
 from readthedocs.builds.constants import (
     ARTIFACT_TYPES,
     ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT,
+    ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF,
     BUILD_FINAL_STATES,
     BUILD_STATE_BUILDING,
     BUILD_STATE_CLONING,
@@ -58,15 +59,16 @@
 from readthedocs.telemetry.tasks import save_build_data
 from readthedocs.worker import app

+from ..constants import DOWNLOADABLE_MEDIA_TYPES, MEDIA_TYPES_EXTENSIONS
 from ..exceptions import (
     ProjectConfigurationError,
     RepositoryError,
     SyncRepositoryLocked,
 )
-from ..models import APIProject, WebHookEvent
+from ..models import APIProject, Feature, WebHookEvent
 from ..signals import before_vcs
 from .mixins import SyncRepositoryMixin
-from .search import fileify
+from .search import fileify, sync_downloadable_artifacts
 from .utils import BuildRequest, clean_build, send_external_build_status

 log = structlog.get_logger(__name__)
@@ -542,11 +544,11 @@ def get_valid_artifact_types(self):

         It performs the following checks on each output format type path:
         - it exists
         - it is a directory
-        - does not contains more than 1 files (only PDF, HTMLZip, ePUB)
+        - does not contain more than one file (HTMLZip, ePUB)

         TODO: remove the limitation of only 1 file.
         Add support for multiple PDF files in the output directory and
-        grab them by using glob syntaxt between other files that could be garbage.
+        grab them by using glob syntax from among other files that could be garbage.
         """
         valid_artifacts = []
         for artifact_type in ARTIFACT_TYPES:
@@ -573,7 +575,14 @@ def get_valid_artifact_types(self):
             # Check if there are multiple files on artifact directories.
             # These output format does not support multiple files yet.
             # In case multiple files are found, the upload for this format is not performed.
-            if artifact_type in ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT:
+            if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS):
+                single_file_artifacts = (
+                    ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF
+                )
+            else:
+                single_file_artifacts = ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT
+
+            if artifact_type in single_file_artifacts:
                 artifact_format_files = len(os.listdir(artifact_directory))
                 if artifact_format_files > 1:
                     log.error(
@@ -598,6 +607,55 @@ def get_valid_artifact_types(self):

         return valid_artifacts

+    def trigger_sync_downloadable_artifacts(self, valid_artifacts):
+        """
+        Trigger the sync_downloadable_artifacts task with the files that were found.
+
+        Performs additional validation on the files found for each artifact type.
+
+        :param valid_artifacts: A list of artifact types allowed by initial validation.
+            Example: ["pdf", "epub"]
+        :return: None
+        """
+
+        # We only look at artifacts that are part of DOWNLOADABLE_MEDIA_TYPES
+        artifact_types = set(valid_artifacts).intersection(
+            set(DOWNLOADABLE_MEDIA_TYPES)
+        )
+
+        artifacts_found_for_download = {
+            artifact_type: [] for artifact_type in artifact_types
+        }
+
+        for artifact_type in artifact_types:
+            artifact_directory = self.data.project.artifact_path(
+                version=self.data.version.slug,
+                type_=artifact_type,
+            )
+            artifact_directory_ls = os.listdir(artifact_directory)
+            log.info(
+                "Found artifacts in output directory",
+                artifact_directory=artifact_directory,
+                artifact_directory_ls=artifact_directory_ls,
+            )
+            for filename in artifact_directory_ls:
+                log.info("Found an artifact in output directory", filename=filename)
+                extensions_allowed = MEDIA_TYPES_EXTENSIONS[artifact_type]
+                if not os.path.isfile(os.path.join(artifact_directory, filename)):
+                    continue
+                if not any(filename.endswith(f".{ext}") for ext in extensions_allowed):
+                    log.info("Skipping file with disallowed extension", filename=filename)
+                    continue
+                artifacts_found_for_download[artifact_type].append(filename)
+
+        # Store ImportedFiles for artifacts, using their paths in storage
+        sync_downloadable_artifacts.delay(
+            version_pk=self.data.version.pk,
+            commit=self.data.build["commit"],
+            build=self.data.build["id"],
+            artifacts_found_for_download=artifacts_found_for_download,
+        )
+
     def on_success(self, retval, task_id, args, kwargs):
         valid_artifacts = self.get_valid_artifact_types()
@@ -633,6 +691,11 @@ def on_success(self, retval, task_id, args, kwargs):
                 search_ignore=self.data.config.search.ignore,
             )

+        # For the new feature, we trigger a task that will create ImportedFile objects.
+        if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS):
+            if valid_artifacts:
+                self.trigger_sync_downloadable_artifacts(valid_artifacts)
+
         if not self.data.project.has_valid_clone:
             self.set_valid_clone()
@@ -736,7 +799,7 @@ def execute(self):
                 data=self.data,
             )

-            # Clonning
+            # Cloning
             self.update_build(state=BUILD_STATE_CLONING)

             # TODO: remove the ``create_vcs_environment`` hack. Ideally, this should be
@@ -884,7 +947,17 @@ def store_build_artifacts(self):
                 version_type=self.data.version.type,
             )
             try:
-                build_media_storage.rclone_sync_directory(from_path, to_path)
+                if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS):
+                    rclone_kwargs = {}
+                    if media_type in MEDIA_TYPES_EXTENSIONS:
+                        rclone_kwargs["filter_extensions"] = MEDIA_TYPES_EXTENSIONS[
+                            media_type
+                        ]
+                    build_media_storage.rclone_sync_directory(
+                        from_path, to_path, **rclone_kwargs
+                    )
+                else:
+                    build_media_storage.rclone_sync_directory(from_path, to_path)
             except Exception as exc:
                 # NOTE: the exceptions reported so far are:
                 # - botocore.exceptions:HTTPClientError
diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index ae382e8dad2..432eace0c31 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -1,3 +1,4 @@
+import os
 from fnmatch import fnmatch

 import structlog
@@ -35,7 +36,7 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
         return

     log.info(
-        'Creating ImportedFiles',
+        "Creating ImportedFiles for search indexing",
         project_slug=version.project.slug,
         version_slug=version.slug,
     )
@@ -56,6 +57,45 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
         log.exception('Failed during ImportedFile syncing')


+@app.task(queue="web")
+def sync_downloadable_artifacts(
+    version_pk, commit, build, artifacts_found_for_download
+):
+    """
+    Create ImportedFile objects for downloadable files.
+
+    Afterwards, ImportedFile objects are used to generate a list of files that can be downloaded
+    for each documentation version.
+
+    :param artifacts_found_for_download: A dictionary with a list of files for each artifact type.
+        For example: {"pdf": ["path/to/file1.pdf", "path/to/file2.pdf"]}
+    """
+    version = Version.objects.get_object_or_log(pk=version_pk)
+
+    # Don't create downloads for external version builds for now
+    if not version or version.type == EXTERNAL:
+        return
+
+    log.info(
+        "Creating ImportedFiles for artifact downloads",
+        project_slug=version.project.slug,
+        version_slug=version.slug,
+        artifacts_found_for_download=artifacts_found_for_download,
+    )
+
+    for artifact_type, artifact_paths in artifacts_found_for_download.items():
+        for fpath in artifact_paths:
+            name = os.path.basename(fpath)
+            ImportedFile.objects.create(
+                name=name,
+                project=version.project,
+                version=version,
+                path=fpath,
+                commit=commit,
+                build=build,
+                ignore=True,
+            )
+
+
 def _sync_imported_files(version, build):
     """
     Sync/Update/Delete ImportedFiles objects of this version.
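The filename validation that `trigger_sync_downloadable_artifacts` performs before queuing the task above reduces to a small, self-contained filter. A minimal standalone sketch of that logic follows; the mapping literal mirrors the `MEDIA_TYPES_EXTENSIONS` constant added earlier in this diff, and `filter_downloadable_artifacts` is a hypothetical helper used only for illustration (it is not part of this PR; sorting is added here purely to make the example deterministic):

import os
import tempfile

# Assumed to mirror the MEDIA_TYPES_EXTENSIONS mapping added in
# readthedocs/projects/constants.py above.
MEDIA_TYPES_EXTENSIONS = {
    "pdf": ("pdf",),
    "epub": ("epub",),
    "htmlzip": ("zip",),
    "json": ("json",),
}

def filter_downloadable_artifacts(artifact_directory, artifact_type):
    """Keep only regular files whose extension is allowed for this media type."""
    extensions_allowed = MEDIA_TYPES_EXTENSIONS[artifact_type]
    found = []
    for filename in os.listdir(artifact_directory):
        # Subdirectories (and anything that is not a regular file) are skipped.
        if not os.path.isfile(os.path.join(artifact_directory, filename)):
            continue
        # Files with a disallowed extension are skipped instead of failing the build.
        if not any(filename.endswith(f".{ext}") for ext in extensions_allowed):
            continue
        found.append(filename)
    return sorted(found)

# Example: only the two PDFs survive; the log file and the subdirectory do not.
pdf_dir = tempfile.mkdtemp()
for name in ("a.pdf", "b.pdf", "junk.log"):
    open(os.path.join(pdf_dir, name), "w").close()
os.mkdir(os.path.join(pdf_dir, "subdir"))
assert filter_downloadable_artifacts(pdf_dir, "pdf") == ["a.pdf", "b.pdf"]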
diff --git a/readthedocs/projects/tests/mockers.py b/readthedocs/projects/tests/mockers.py index c86d27b6c8c..9fe8acf769b 100644 --- a/readthedocs/projects/tests/mockers.py +++ b/readthedocs/projects/tests/mockers.py @@ -75,10 +75,10 @@ def _mock_artifact_builders(self): # self.patches['builder.pdf.LatexBuildCommand.output'] = mock.patch( # 'readthedocs.doc_builder.backends.sphinx.LatexBuildCommand.output', # ) - self.patches['builder.pdf.glob'] = mock.patch( - 'readthedocs.doc_builder.backends.sphinx.glob', - return_value=['output.file'], - ) + # self.patches['builder.pdf.glob'] = mock.patch( + # 'readthedocs.doc_builder.backends.sphinx.glob', + # return_value=['output.file'], + # ) self.patches['builder.pdf.os.path.getmtime'] = mock.patch( 'readthedocs.doc_builder.backends.sphinx.os.path.getmtime', @@ -262,3 +262,8 @@ def _mock_api(self): f'{settings.SLUMBER_API_HOST}/api/v2/project/{self.project.pk}/', status_code=201, ) + + self.requestsmock.post( + f"{settings.SLUMBER_API_HOST}/api/v2/revoke/", + status_code=204, + ) diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index 7a085004ff0..c3d40ad0641 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -1,5 +1,6 @@ import os import pathlib +import tempfile from unittest import mock import django_dynamic_fixture as fixture @@ -16,10 +17,17 @@ from readthedocs.config.config import BuildConfigV2 from readthedocs.doc_builder.exceptions import BuildAppError from readthedocs.projects.exceptions import RepositoryError -from readthedocs.projects.models import EnvironmentVariable, Project, WebHookEvent +from readthedocs.projects.models import ( + EnvironmentVariable, + Feature, + ImportedFile, + Project, + WebHookEvent, +) from readthedocs.projects.tasks.builds import sync_repository_task, update_docs_task from readthedocs.telemetry.models import BuildData +from ..constants import MEDIA_TYPES_EXTENSIONS from .mockers import BuildEnvironmentMocker @@ -309,6 +317,157 @@ def test_build_updates_documentation_type(self, load_yaml_config): "has_htmlzip": False, } + @mock.patch("readthedocs.doc_builder.director.load_yaml_config") + def test_build_updates_multiple_artifacts(self, load_yaml_config): + """ + Test that Feature.ENABLE_MULTIPLE_PDFS works. + + We mock that several .pdfs were created and verify that ImportedFile objects are in place. + """ + + fixture.get( + Feature, + projects=[self.project], + feature_id=Feature.ENABLE_MULTIPLE_PDFS, + ) + + assert self.version.documentation_type == "sphinx" + load_yaml_config.return_value = self._config_file( + { + "version": 2, + "formats": ["epub", "pdf"], + "sphinx": { + "configuration": "docs/conf.py", + }, + } + ) + + # Create the artifact paths, so that `store_build_artifacts` + # properly runs: https://github.com/readthedocs/readthedocs.org/blob/faa611fad689675f81101ea643770a6b669bf529/readthedocs/projects/tasks/builds.py#L798-L804 + os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html")) + + # epub and pdf output dirs + epub_dir = self.project.artifact_path(version=self.version.slug, type_="epub") + pdf_dir = self.project.artifact_path(version=self.version.slug, type_="pdf") + os.makedirs(epub_dir) + os.makedirs(pdf_dir) + + # Create a .tex file because of a validation step that verifies that intermediate + # .tex files were created. 
+        pathlib.Path(
+            os.path.join(
+                pdf_dir,
+                f"{self.project.slug}-test1.tex",
+            )
+        ).touch()
+
+        # Create 1 epub and 2 pdfs
+        pathlib.Path(
+            os.path.join(
+                epub_dir,
+                f"{self.project.slug}.epub",
+            )
+        ).touch()
+        pathlib.Path(
+            os.path.join(
+                pdf_dir,
+                f"{self.project.slug}-test1.pdf",
+            )
+        ).touch()
+        pathlib.Path(
+            os.path.join(
+                pdf_dir,
+                f"{self.project.slug}-test2.pdf",
+            )
+        ).touch()
+
+        self._trigger_update_docs_task()
+
+        # Update version state
+        assert self.requests_mock.request_history[7]._request.method == "PATCH"
+        assert self.requests_mock.request_history[7].path == "/api/v2/version/1/"
+        assert self.requests_mock.request_history[7].json() == {
+            "addons": False,
+            "build_data": None,
+            "built": True,
+            "documentation_type": "sphinx",
+            "has_pdf": True,
+            "has_epub": True,
+            "has_htmlzip": False,
+        }
+
+        assert (
+            ImportedFile.objects.pdf_files()
+            .filter(project=self.project, name=f"{self.project.slug}-test1.pdf")
+            .count()
+            == 1
+        )
+        assert (
+            ImportedFile.objects.pdf_files()
+            .filter(project=self.project, name=f"{self.project.slug}-test2.pdf")
+            .count()
+            == 1
+        )
+
+    @mock.patch("readthedocs.doc_builder.director.load_yaml_config")
+    def test_build_fail_if_no_artifacts(self, load_yaml_config):
+        """
+        Test that a build with Feature.ENABLE_MULTIPLE_PDFS fails when no PDFs are found.
+
+        We mock a build that produces no .pdf files and verify that the build fails with
+        a "PDF file was not generated" error.
+        """
+        assert self.version.documentation_type == "sphinx"
+
+        fixture.get(
+            Feature,
+            projects=[self.project],
+            feature_id=Feature.ENABLE_MULTIPLE_PDFS,
+        )
+
+        load_yaml_config.return_value = self._config_file(
+            {
+                "version": 2,
+                "formats": ["epub", "pdf"],
+                "sphinx": {
+                    "configuration": "docs/conf.py",
+                },
+            }
+        )
+
+        # Create the artifact paths, so that `store_build_artifacts`
+        # properly runs: https://github.com/readthedocs/readthedocs.org/blob/faa611fad689675f81101ea643770a6b669bf529/readthedocs/projects/tasks/builds.py#L798-L804
+        os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html"))
+
+        # epub and pdf output dirs
+        epub_dir = self.project.artifact_path(version=self.version.slug, type_="epub")
+        pdf_dir = self.project.artifact_path(version=self.version.slug, type_="pdf")
+        os.makedirs(epub_dir)
+        os.makedirs(pdf_dir)
+
+        # Create a .tex file because of a validation step that verifies that intermediate
+        # .tex files were created.
+        pathlib.Path(
+            os.path.join(
+                pdf_dir,
+                f"{self.project.slug}-test1.tex",
+            )
+        ).touch()
+
+        # Create 1 epub and 0 pdfs
+        pathlib.Path(
+            os.path.join(
+                epub_dir,
+                f"{self.project.slug}.epub",
+            )
+        ).touch()
+
+        self._trigger_update_docs_task()
+
+        output_json = self.requests_mock.request_history[6].json()
+
+        assert "error" in output_json
+        assert "PDF file was not generated" in output_json["error"]
+
     @pytest.mark.parametrize(
         "config",
         [
@@ -404,10 +563,12 @@ def test_get_env_vars(self, load_yaml_config, build_environment, config, externa
     @mock.patch("readthedocs.projects.tasks.builds.send_external_build_status")
     @mock.patch("readthedocs.projects.tasks.builds.UpdateDocsTask.send_notifications")
     @mock.patch("readthedocs.projects.tasks.builds.clean_build")
+    @mock.patch("readthedocs.doc_builder.backends.sphinx.PdfBuilder._check_for_files")
    @mock.patch("readthedocs.doc_builder.director.load_yaml_config")
     def test_successful_build(
         self,
         load_yaml_config,
+        _check_for_files,
         clean_build,
         send_notifications,
         send_external_build_status,
@@ -667,10 +828,22 @@ def test_failed_build(
         assert revoke_key_request.path == "/api/v2/revoke/"

     @mock.patch("readthedocs.doc_builder.director.load_yaml_config")
+    @mock.patch("readthedocs.doc_builder.backends.sphinx.PdfBuilder._check_for_files")
+    @mock.patch(
+        "readthedocs.doc_builder.backends.sphinx.EpubBuilder._get_epub_files_generated"
+    )
     def test_build_commands_executed(
         self,
+        _get_epub_files_generated,
+        _check_for_files,
         load_yaml_config,
     ):
+        # This test only works without the ENABLE_MULTIPLE_PDFS feature flag.
+        # Several other tests also predate this feature flag and will have to be
+        # adapted later -- this one behaves differently with the flag enabled, so
+        # assert that up front to save anyone a long test-assertion debugging trip.
+ assert not self.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS) + load_yaml_config.return_value = self._config_file( { "version": 2, @@ -680,15 +853,19 @@ def test_build_commands_executed( }, } ) + _get_epub_files_generated.return_value = ["some file names that are not used"] # Create the artifact paths, so it's detected by the builder os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html")) - os.makedirs(self.project.artifact_path(version=self.version.slug, type_="json")) - os.makedirs( - self.project.artifact_path(version=self.version.slug, type_="htmlzip") - ) - os.makedirs(self.project.artifact_path(version=self.version.slug, type_="epub")) - os.makedirs(self.project.artifact_path(version=self.version.slug, type_="pdf")) + for f in ("epub", "pdf", "htmlzip", "json"): + extension = MEDIA_TYPES_EXTENSIONS[f] + os.makedirs(self.project.artifact_path(version=self.version.slug, type_=f)) + pathlib.Path( + os.path.join( + self.project.artifact_path(version=self.version.slug, type_=f), + f"{self.project.slug}.{extension}", + ) + ).touch() self._trigger_update_docs_task() @@ -912,6 +1089,285 @@ def test_build_commands_executed( ] ) + @mock.patch("readthedocs.doc_builder.director.load_yaml_config") + @mock.patch("readthedocs.doc_builder.backends.sphinx.PdfBuilder._check_for_files") + @mock.patch( + "readthedocs.doc_builder.backends.sphinx.PdfBuilder._pdf_files_generated" + ) + @mock.patch( + "readthedocs.doc_builder.backends.sphinx.EpubBuilder._get_epub_files_generated" + ) + def test_build_commands_executed_multiple_artifacts( + self, + _get_epub_files_generated, + _pdf_files_generated, + _check_for_files, + load_yaml_config, + ): + """ + This is the new version of test_build_commands_executed with ENABLE_MULTIPLE_PDFS enabled. + """ + fixture.get( + Feature, projects=[self.project], feature_id=Feature.ENABLE_MULTIPLE_PDFS + ) + + load_yaml_config.return_value = self._config_file( + { + "version": 2, + "formats": "all", + "sphinx": { + "configuration": "docs/conf.py", + }, + } + ) + _get_epub_files_generated.return_value = ["output.epub"] + + # Generate fake PDF output + tempdir_for_mock = tempfile.mkdtemp() + open(f"{tempdir_for_mock}/output.pdf", "w").write("") + _pdf_files_generated.return_value = [f"{tempdir_for_mock}/output.pdf"] + + # Create the artifact paths, so it's detected by the builder + os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html")) + for f in ("epub", "pdf", "htmlzip", "json"): + extension = MEDIA_TYPES_EXTENSIONS[f] + os.makedirs(self.project.artifact_path(version=self.version.slug, type_=f)) + pathlib.Path( + os.path.join( + self.project.artifact_path(version=self.version.slug, type_=f), + f"{self.project.slug}.{extension}", + ) + ).touch() + + self._trigger_update_docs_task() + + self.mocker.mocks["git.Backend.run"].assert_has_calls( + [ + mock.call( + "git", "clone", "--no-single-branch", "--depth", "50", mock.ANY, "." 
+ ), + mock.call("git", "checkout", "--force", "a1b2c3"), + mock.call("git", "clean", "-d", "-f", "-f"), + ] + ) + + self.mocker.mocks["environment.run"].assert_has_calls( + [ + mock.call( + "python3.7", + "-mvirtualenv", + mock.ANY, + bin_path=None, + cwd=None, + ), + mock.call( + mock.ANY, + "-m", + "pip", + "install", + "--upgrade", + "--no-cache-dir", + "pip", + "setuptools", + bin_path=mock.ANY, + cwd=mock.ANY, + ), + mock.call( + mock.ANY, + "-m", + "pip", + "install", + "--upgrade", + "--no-cache-dir", + "pillow", + "mock==1.0.1", + "alabaster>=0.7,<0.8,!=0.7.5", + "commonmark==0.9.1", + "recommonmark==0.5.0", + "sphinx<2", + "sphinx-rtd-theme<0.5", + "readthedocs-sphinx-ext<2.3", + "jinja2<3.1.0", + bin_path=mock.ANY, + cwd=mock.ANY, + ), + # FIXME: shouldn't this one be present here? It's not now because + # we are mocking `append_conf` which is the one that triggers this + # command. + # + # mock.call( + # 'cat', + # 'docs/conf.py', + # cwd=mock.ANY, + # ), + mock.call( + mock.ANY, + "-m", + "sphinx", + "-T", + "-E", + "-b", + "html", + "-d", + "_build/doctrees", + "-D", + "language=en", + ".", + "$READTHEDOCS_OUTPUT/html", + cwd=mock.ANY, + bin_path=mock.ANY, + ), + mock.call( + mock.ANY, + "-m", + "sphinx", + "-T", + "-E", + "-b", + "readthedocssinglehtmllocalmedia", + "-d", + "_build/doctrees", + "-D", + "language=en", + ".", + "$READTHEDOCS_OUTPUT/htmlzip", + cwd=mock.ANY, + bin_path=mock.ANY, + ), + mock.call( + "mktemp", + "--directory", + record=False, + ), + mock.call( + "mv", + mock.ANY, + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + "mkdir", + "--parents", + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + "zip", + "--recurse-paths", + "--symlinks", + mock.ANY, + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + mock.ANY, + "-m", + "sphinx", + "-T", + "-E", + "-b", + "latex", + "-d", + "_build/doctrees", + "-D", + "language=en", + ".", + "$READTHEDOCS_OUTPUT/pdf", + cwd=mock.ANY, + bin_path=mock.ANY, + ), + mock.call("cat", "latexmkrc", cwd=mock.ANY), + mock.call("rm", "-r", mock.ANY, cwd=mock.ANY, record=False), + mock.call("mkdir", "-p", mock.ANY, cwd=mock.ANY, record=False), + mock.call( + "mv", + mock.ANY, + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + mock.ANY, + "-m", + "sphinx", + "-T", + "-E", + "-b", + "epub", + "-d", + "_build/doctrees", + "-D", + "language=en", + ".", + "$READTHEDOCS_OUTPUT/epub", + cwd=mock.ANY, + bin_path=mock.ANY, + ), + mock.call( + "mv", + "output.epub", + "/tmp/project-latest.epub", + cwd=mock.ANY, + record=False, + ), + mock.call( + "rm", + "--recursive", + "$READTHEDOCS_OUTPUT/epub", + cwd=mock.ANY, + record=False, + ), + mock.call( + "mkdir", + "--parents", + "$READTHEDOCS_OUTPUT/epub", + cwd=mock.ANY, + record=False, + ), + mock.call( + "mv", + "/tmp/project-latest.epub", + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + "test", + "-x", + "_build/html", + record=False, + cwd=mock.ANY, + ), + # FIXME: I think we are hitting this issue here: + # https://github.com/pytest-dev/pytest-mock/issues/234 + mock.call("lsb_release", "--description", record=False, demux=True), + mock.call("python", "--version", record=False, demux=True), + mock.call( + "dpkg-query", + "--showformat", + "${package} ${version}\\n", + "--show", + record=False, + demux=True, + ), + mock.call( + "python", + "-m", + "pip", + "list", + "--pre", + "--local", + "--format", + "json", + record=False, + demux=True, + ), + ] + ) + @mock.patch("readthedocs.doc_builder.director.load_yaml_config") def 
test_install_apt_packages(self, load_yaml_config):
         config = BuildConfigV2(
diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py
index 78e138e7b7e..89b645f921b 100644
--- a/readthedocs/rtd_tests/tests/test_build_storage.py
+++ b/readthedocs/rtd_tests/tests/test_build_storage.py
@@ -121,6 +121,46 @@ def test_copy_directory_source_outside_docroot(self):
         with pytest.raises(SuspiciousFileOperation, match="outside the docroot"):
             self.storage.copy_directory(tmp_dir, "files")

+    def test_sync_directory_with_filter(self):
+        """Test that we can ask the rclone_sync_directory method to only include a specific extension."""
+        tmp_files_dir = os.path.join(tempfile.mkdtemp(), "files")
+        # Copy files_dir (with all test files) into tmp_files_dir
+        shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
+        storage_dir = "files"
+
+        with override_settings(DOCROOT=tmp_files_dir):
+            self.storage.rclone_sync_directory(
+                tmp_files_dir, storage_dir, filter_extensions=["html"]
+            )
+        # We only accept TOP-LEVEL files, so only test.html and not api/index.html
+        self.assertFileTree(storage_dir, ("test.html",))
+
+        # Copy in a couple of other types of files
+        open(os.path.join(tmp_files_dir, "test.pdf"), "w").write("test")
+        open(os.path.join(tmp_files_dir, "test.exe"), "w").write("test")
+
+        # Sync again, this time including only the .pdf file
+        with override_settings(DOCROOT=tmp_files_dir):
+            self.storage.rclone_sync_directory(
+                tmp_files_dir, storage_dir, filter_extensions=["pdf"]
+            )
+
+        # Test that the PDF file is now there
+        # ...AND the file we copied in the previous step.
+        self.assertFileTree(
+            storage_dir,
+            (
+                "test.html",
+                "test.pdf",
+            ),
+        )
+
+    def test_sync_directory_with_restriction(self):
+        """Test that we can ask the rclone_sync_directory method to only include a specific extension."""
+        tmp_files_dir = os.path.join(tempfile.mkdtemp(), "files")
+        # Copy files_dir (with all test files) into tmp_files_dir
+        shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
+
     def test_delete_directory(self):
         with override_settings(DOCROOT=files_dir):
             self.storage.copy_directory(files_dir, "files")
diff --git a/readthedocs/rtd_tests/tests/test_project.py b/readthedocs/rtd_tests/tests/test_project.py
index a01c4d9837d..74bbd154645 100644
--- a/readthedocs/rtd_tests/tests/test_project.py
+++ b/readthedocs/rtd_tests/tests/test_project.py
@@ -22,6 +22,7 @@
 from readthedocs.builds.models import Build, Version
 from readthedocs.oauth.services import GitHubService, GitLabService
 from readthedocs.projects.constants import (
+    BUILD_COMMANDS_OUTPUT_PATH,
     GITHUB_BRAND,
     GITLAB_BRAND,
     MEDIA_TYPE_EPUB,
@@ -31,7 +32,7 @@
     MEDIA_TYPES,
 )
 from readthedocs.projects.exceptions import ProjectConfigurationError
-from readthedocs.projects.models import Project
+from readthedocs.projects.models import ImportedFile, Project
 from readthedocs.projects.tasks.utils import finish_inactive_builds
 from readthedocs.rtd_tests.mocks.paths import fake_paths_by_regex
@@ -279,6 +280,23 @@ def test_git_service_class_gitlab(self):
         self.pip.save()
         self.assertEqual(self.pip.git_service_class(), GitLabService)

+    def test_artifact_path(self):
+        """Verify that we can generate a meaningful path from known methods and constants."""
+        project1 = get(Project, main_language_project=None)
+
+        imported_pdf = get(
+            ImportedFile,
+            project=project1,
+            name="test1.pdf",
+            path="test1.pdf",
+        )
+        media_path = project1.artifact_path(imported_pdf.get_media_type())
+
+        # Verify that BUILD_COMMANDS_OUTPUT_PATH is part of the path
+        assert media_path.endswith(f"/{BUILD_COMMANDS_OUTPUT_PATH}pdf")
+        # Verify that it ends with /pdf as expected
+        assert media_path.endswith("/pdf")
+

 @mock.patch('readthedocs.projects.forms.trigger_build', mock.MagicMock())
 class TestProjectTranslations(ProjectMixin, TestCase):
diff --git a/readthedocs/rtd_tests/tests/test_project_querysets.py b/readthedocs/rtd_tests/tests/test_project_querysets.py
index 0a9424b22ca..3b9960837b4 100644
--- a/readthedocs/rtd_tests/tests/test_project_querysets.py
+++ b/readthedocs/rtd_tests/tests/test_project_querysets.py
@@ -7,7 +7,7 @@

 from readthedocs.organizations.models import Organization
 from readthedocs.projects.constants import PRIVATE, PUBLIC
-from readthedocs.projects.models import Feature, Project
+from readthedocs.projects.models import Feature, ImportedFile, Project
 from readthedocs.projects.querysets import (
     ChildRelatedProjectQuerySet,
     ParentRelatedProjectQuerySet,
@@ -292,3 +292,45 @@ def test_feature_multiple_projects(self):
             [repr(feature)],
             ordered=False,
         )
+
+    def test_importedfile_queryset(self):
+        """Verify queryset methods for ImportedFile."""
+        project1 = fixture.get(Project, main_language_project=None)
+
+        imported_pdf = fixture.get(
+            ImportedFile,
+            project=project1,
+            name="test1.pdf",
+            path="test1.pdf",
+        )
+
+        imported_epub = fixture.get(
+            ImportedFile,
+            project=project1,
+            name="test1.epub",
+            path="test1.epub",
+        )
+
+        imported_html = fixture.get(
+            ImportedFile,
+            project=project1,
+            name="test1.html",
+            path="test1.html",
+        )
+
+        imported_htmlzip = fixture.get(
+            ImportedFile,
+            project=project1,
+            name="test1.zip",
+            path="test1.zip",
+        )
+
+        assert ImportedFile.objects.html_files().count() == 1
+        assert ImportedFile.objects.pdf_files().count() == 1
+        assert ImportedFile.objects.epub_files().count() == 1
+        assert ImportedFile.objects.htmlzip_files().count() == 1
+
+        assert ImportedFile.objects.html_files().first().name == "test1.html"
+        assert ImportedFile.objects.pdf_files().first().name == "test1.pdf"
+        assert ImportedFile.objects.epub_files().first().name == "test1.epub"
+        assert ImportedFile.objects.htmlzip_files().first().name == "test1.zip"
diff --git a/readthedocs/storage/rclone.py b/readthedocs/storage/rclone.py
index ab88f3fe64d..43bfc558630 100644
--- a/readthedocs/storage/rclone.py
+++ b/readthedocs/storage/rclone.py
@@ -112,16 +112,32 @@ def execute(self, subcommand, args, options=None):
         )
         return result

-    def sync(self, source, destination):
+    def sync(self, source, destination, filter_extensions=None):
         """
         Run the `rclone sync` command.

-        See https://rclone.org/commands/rclone_sync/.
+        See:
+        https://rclone.org/commands/rclone_sync/
+        https://rclone.org/filtering/

         :params source: Local path to the source directory.
         :params destination: Remote path to the destination directory.
+        :params filter_extensions: Copy only top-level files with the listed file extensions.
""" - return self.execute("sync", args=[source, self.get_target(destination)]) + options = [] + + if filter_extensions: + options += ["--ignore-case", "--include"] + # Python escape rule: {{ = { + # What we want to have is for instance '/*.{pdf}' + filter_pattern = "/*.{{{extensions}}}".format( + extensions=",".join(filter_extensions) + ) + options.append(filter_pattern) + + return self.execute( + "sync", args=[source, self.get_target(destination)], options=options + ) class RCloneLocal(BaseRClone): diff --git a/readthedocs/telemetry/collectors.py b/readthedocs/telemetry/collectors.py index af1202d0cdd..09177848b16 100644 --- a/readthedocs/telemetry/collectors.py +++ b/readthedocs/telemetry/collectors.py @@ -17,7 +17,7 @@ class BuildDataCollector: """ Build data collector. - Collect data from a runnig build. + Collect data from a running build. """ def __init__(self, environment): @@ -36,7 +36,7 @@ def __init__(self, environment): @staticmethod def _safe_json_loads(content, default=None): - def lowercase(d): # pylint: disable=invalid-name + def lowercase(d): """Convert all dictionary keys to lowercase.""" return {k.lower(): i for k, i in d.items()} @@ -58,7 +58,7 @@ def run(self, *args, **kwargs): def collect(self): """ - Collect all relevant data from the runnig build. + Collect all relevant data from the running build. Data that can be extracted from the database (project/organization) isn't collected here.