From b687d835aa773e0c3bc98cb70b174271f1c91d92 Mon Sep 17 00:00:00 2001
From: Ivan Shcheklein
Date: Thu, 28 Dec 2023 11:38:10 -0800
Subject: [PATCH] chore(fs): add tests to cover recent PRs

---
 pydrive2/fs/spec.py        |  33 ++++---
 pydrive2/test/test_fs.py   | 174 ++++++++++++++++++++++++++++++++-----
 pydrive2/test/test_util.py |   4 +-
 3 files changed, 170 insertions(+), 41 deletions(-)

diff --git a/pydrive2/fs/spec.py b/pydrive2/fs/spec.py
index 9f36b415..60b51973 100644
--- a/pydrive2/fs/spec.py
+++ b/pydrive2/fs/spec.py
@@ -245,17 +245,18 @@ def split_path(self, path):
     @wrap_prop(threading.RLock())
     @cached_property
     def _ids_cache(self):
-        cache = {
-            "dirs": defaultdict(list),
-            "ids": {},
-            "root_id": self._get_item_id(
-                self.path,
-                use_cache=False,
-                hint="Confirm the directory exists and you can access it.",
-            ),
-        }
+        cache = {"dirs": defaultdict(list), "ids": {}}
+
+        base_item_ids = self._path_to_item_ids(self.base, use_cache=False)
+        if not base_item_ids:
+            raise FileNotFoundError(
+                errno.ENOENT,
+                os.strerror(errno.ENOENT),
+                f"Confirm {self.path} exists and you can access it",
+            )
+
+        self._cache_path_id(self.base, *base_item_ids, cache=cache)
 
-        self._cache_path_id(self.base, cache["root_id"], cache=cache)
         return cache
 
     def _cache_path_id(self, path, *item_ids, cache=None):
@@ -366,7 +367,7 @@ def _path_to_item_ids(self, path, create=False, use_cache=True):
             [self._create_dir(min(parent_ids), title, path)] if create else []
         )
 
-    def _get_item_id(self, path, create=False, use_cache=True, hint=None):
+    def _get_item_id(self, path, create=False, use_cache=True):
         bucket, base = self.split_path(path)
         assert bucket == self.root
 
@@ -375,9 +376,7 @@ def _get_item_id(self, path, create=False, use_cache=True, hint=None):
             return min(item_ids)
 
         assert not create
-        raise FileNotFoundError(
-            errno.ENOENT, os.strerror(errno.ENOENT), hint or path
-        )
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
 
     @_gdrive_retry
     def _gdrive_create_dir(self, parent_id, title):
@@ -427,9 +426,9 @@ def info(self, path):
 
     def ls(self, path, detail=False):
         bucket, base = self.split_path(path)
+        assert bucket == self.root
 
         dir_ids = self._path_to_item_ids(base)
-
         if not dir_ids:
             raise FileNotFoundError(
                 errno.ENOENT, os.strerror(errno.ENOENT), path
@@ -465,14 +464,14 @@ def ls(self, path, detail=False):
 
     def find(self, path, detail=False, **kwargs):
         bucket, base = self.split_path(path)
-
-        seen_paths = set()
+        assert bucket == self.root
 
         # Make sure the base path is cached and dir_ids below has some
         # dirs revelant to this call
         self._path_to_item_ids(base)
 
         dir_ids = [self._ids_cache["ids"].copy()]
+        seen_paths = set()
         contents = []
         while dir_ids:
             query_ids = {
diff --git a/pydrive2/test/test_fs.py b/pydrive2/test/test_fs.py
index c895846f..483ab7eb 100644
--- a/pydrive2/test/test_fs.py
+++ b/pydrive2/test/test_fs.py
@@ -1,3 +1,4 @@
+from io import StringIO
 import os
 import posixpath
 import secrets
@@ -25,17 +26,39 @@ def remote_dir(base_remote_dir):
     return base_remote_dir + "/" + str(uuid.uuid4())
 
 
-@pytest.fixture
-def fs(tmpdir, base_remote_dir):
+@pytest.fixture(scope="module")
+def service_auth(tmp_path_factory):
     setup_credentials()
-    auth = GoogleAuth(settings_file_path("default.yaml", tmpdir / ""))
+    tmpdir = tmp_path_factory.mktemp("settings")
+    auth = GoogleAuth(settings_file_path("default.yaml", wkdir=tmpdir))
     auth.ServiceAuth()
+    return auth
+
+
+@pytest.fixture(scope="module")
+def fs_factory(base_remote_dir, service_auth):
+    base_item = None
+    GDriveFileSystem.cachable = False
+
+    def _create_fs():
+        nonlocal base_item
+        _, base = base_remote_dir.split("/", 1)
+        fs = GDriveFileSystem(base_remote_dir, service_auth)
+        if base_item is None:
+            base_item = fs._gdrive_create_dir("root", base)
+
+        return fs, base_item
+
+    yield _create_fs
+
+    GDriveFileSystem.cachable = True
+    fs = GDriveFileSystem(base_remote_dir, service_auth)
+    fs.rm_file(base_remote_dir)
 
-    bucket, base = base_remote_dir.split("/", 1)
-    fs = GDriveFileSystem(base_remote_dir, auth)
-    fs._gdrive_create_dir("root", base)
 
-    return fs
+@pytest.fixture
+def fs(fs_factory):
+    return fs_factory()[0]
 
 
 @pytest.mark.manual
@@ -66,7 +89,7 @@ def test_fs_service_json(base_remote_dir):
     )
 
 
-def test_info(fs, tmpdir, remote_dir):
+def test_info(fs, remote_dir):
     fs.touch(remote_dir + "/info/a.txt")
     fs.touch(remote_dir + "/info/b.txt")
     details = fs.info(remote_dir + "/info/a.txt")
@@ -116,20 +139,20 @@ def test_rm(fs, remote_dir):
     assert not fs.exists(remote_dir + "/dir/c/a")
 
 
-def test_ls(fs: GDriveFileSystem, remote_dir):
-    _, base = fs.split_path(remote_dir + "dir/")
+def test_ls(fs, remote_dir):
+    _, base = fs.split_path(remote_dir + "/dir/")
     fs._path_to_item_ids(base, create=True)
-    assert fs.ls(remote_dir + "dir/") == []
+    assert fs.ls(remote_dir + "/dir/") == []
 
     files = set()
     for no in range(8):
-        file = remote_dir + f"dir/test_{no}"
+        file = remote_dir + f"/dir/test_{no}"
         fs.touch(file)
         files.add(file)
 
-    assert set(fs.ls(remote_dir + "dir/")) == files
+    assert set(fs.ls(remote_dir + "/dir/")) == files
 
-    dirs = fs.ls(remote_dir + "dir/", detail=True)
+    dirs = fs.ls(remote_dir + "/dir/", detail=True)
     expected = [fs.info(file) for file in files]
 
     def by_name(details):
@@ -141,12 +164,95 @@ def by_name(details):
     assert dirs == expected
 
 
+def test_basic_ops_caching(fs_factory, remote_dir, mocker):
+    # Internally we have to dereference names into IDs to call GDrive APIs;
+    # we try hard to cache those and make sure that operations like
+    # exists, ls, find, etc. don't hit the API more than once per path
+
+    # ListFile (_gdrive_list) is the main operation that we use to retrieve
+    # file metadata in all operations like find/ls/exists, etc. It should be
+    # fine as a basic benchmark to count those.
+    # Note: we can't count direct API calls since we have retries; for the
+    # same reason we can't count direct calls to the GDrive client either
+    fs, _ = fs_factory()
+    spy = mocker.spy(fs, "_gdrive_list")
+
+    dir_path = remote_dir + "/a/b/c/"
+    file_path = dir_path + "test.txt"
+    fs.touch(file_path)
+
+    assert spy.call_count == 5
+    spy.reset_mock()
+
+    fs.exists(file_path)
+    assert spy.call_count == 1
+    spy.reset_mock()
+
+    fs.ls(remote_dir)
+    assert spy.call_count == 1
+    spy.reset_mock()
+
+    fs.ls(dir_path)
+    assert spy.call_count == 1
+    spy.reset_mock()
+
+    fs.find(dir_path)
+    assert spy.call_count == 1
+    spy.reset_mock()
+
+    fs.find(remote_dir)
+    assert spy.call_count == 1
+    spy.reset_mock()
+
+
+def test_ops_work_with_duplicate_names(fs_factory, remote_dir):
+    fs, base_item = fs_factory()
+
+    remote_dir_item = fs._gdrive_create_dir(
+        base_item["id"], remote_dir.split("/")[-1]
+    )
+    dir_name = str(uuid.uuid4())
+    dir1 = fs._gdrive_create_dir(remote_dir_item["id"], dir_name)
+    dir2 = fs._gdrive_create_dir(remote_dir_item["id"], dir_name)
+
+    # Two directories were created with the same name
+    assert dir1["id"] != dir2["id"]
+
+    dir_path = remote_dir + "/" + dir_name + "/"
+
+    # ls returns both of them, even though the names are the same
+    test_fs = fs
+    result = test_fs.ls(remote_dir)
+    assert len(result) == 2
+    assert set(result) == {dir_path}
+
+    # a fresh fs instance (cold cache) also returns both of them
+    test_fs, _ = fs_factory()
+    result = test_fs.ls(remote_dir)
+    assert len(result) == 2
+    assert set(result) == {dir_path}
+
+    for test_fs in [fs, fs_factory()[0]]:
+        # find by default doesn't return dirs at all
+        result = test_fs.find(remote_dir)
+        assert len(result) == 0
+
+    fs._gdrive_upload_fobj("a.txt", dir1["id"], StringIO(""))
+    fs._gdrive_upload_fobj("b.txt", dir2["id"], StringIO(""))
+
+    for test_fs in [fs, fs_factory()[0]]:
+        # now we should have both files
+        result = test_fs.find(remote_dir)
+        assert len(result) == 2
+        assert set(result) == {dir_path + file for file in ["a.txt", "b.txt"]}
+
+
 def test_ls_non_existing_dir(fs, remote_dir):
     with pytest.raises(FileNotFoundError):
         fs.ls(remote_dir + "dir/")
 
 
-def test_find(fs, remote_dir):
+def test_find(fs, fs_factory, remote_dir):
     fs.mkdir(remote_dir + "/dir")
 
     files = [
@@ -169,12 +275,24 @@ def test_find(fs, remote_dir):
     for file in files:
         fs.touch(file)
 
-    assert set(fs.find(remote_dir)) == set(files)
+    for test_fs in [fs, fs_factory()[0]]:
+        # Test for https://github.com/iterative/PyDrive2/issues/229
+        # It must go first, so that we test with a cache miss as well
+        assert set(test_fs.find(remote_dir + "/dir/c/d/")) == set(
+            [
+                file
+                for file in files
+                if file.startswith(remote_dir + "/dir/c/d/")
+            ]
+        )
+
+        # General find test
+        assert set(test_fs.find(remote_dir)) == set(files)
 
-    find_results = fs.find(remote_dir, detail=True)
-    info_results = [fs.info(file) for file in files]
-    info_results = {content["name"]: content for content in info_results}
-    assert find_results == info_results
+        find_results = test_fs.find(remote_dir, detail=True)
+        info_results = [test_fs.info(file) for file in files]
+        info_results = {content["name"]: content for content in info_results}
+        assert find_results == info_results
 
 
 def test_exceptions(fs, tmpdir, remote_dir):
@@ -199,15 +317,20 @@ def test_open_rw(fs, remote_dir):
         assert stream.read() == data
 
 
-def test_concurrent_operations(fs, remote_dir):
+def test_concurrent_operations(fs, fs_factory, remote_dir):
+    # Include an extra dir name to force upload operations to create it;
+    # this way we can also test that only a single directory is created
+    # even if multiple threads are uploading files into the same dir
+    dir_name = secrets.token_hex(16)
+
     def create_random_file():
         name = secrets.token_hex(16)
-        with fs.open(remote_dir + "/" + name, "w") as stream:
+        with fs.open(remote_dir + f"/{dir_name}/" + name, "w") as stream:
             stream.write(name)
         return name
 
     def read_random_file(name):
-        with fs.open(remote_dir + "/" + name, "r") as stream:
+        with fs.open(remote_dir + f"/{dir_name}/" + name, "r") as stream:
             return stream.read()
 
     with futures.ThreadPoolExecutor() as executor:
@@ -225,6 +348,11 @@ def read_random_file(name):
 
     assert write_names == read_names
 
+    # Test that only a single dir is created
+    for test_fs in [fs, fs_factory()[0]]:
+        results = test_fs.ls(remote_dir)
+        assert results == [remote_dir + f"/{dir_name}/"]
+
 
 def test_put_file(fs, tmpdir, remote_dir):
     src_file = tmpdir / "a.txt"
diff --git a/pydrive2/test/test_util.py b/pydrive2/test/test_util.py
index 8318bfca..aeff678e 100644
--- a/pydrive2/test/test_util.py
+++ b/pydrive2/test/test_util.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 import random
 import re
 import os
@@ -29,7 +30,8 @@ def setup_credentials(credentials_path=DEFAULT_USER_CREDENTIALS_FILE):
 
 def settings_file_path(settings_file, wkdir=LOCAL_PATH):
     template_path = SETTINGS_PATH + settings_file
-    local_path = wkdir + settings_file
+    wkdir = Path(wkdir)
+    local_path = wkdir / settings_file
     assert os.path.exists(template_path)
     if not os.path.exists(wkdir):
         os.makedirs(wkdir, exist_ok=True)
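
A minimal usage sketch of the error path exercised by the pydrive2/fs/spec.py change above: when the filesystem's base path cannot be resolved, building _ids_cache raises FileNotFoundError with a hint derived from fs.path (the hint= argument of _get_item_id is gone). The settings file name and the Drive path below are placeholders, and actually running this requires valid service-account credentials.

    from pydrive2.auth import GoogleAuth
    from pydrive2.fs import GDriveFileSystem

    auth = GoogleAuth("settings.yaml")  # placeholder settings file
    auth.ServiceAuth()

    # Placeholder path, assumed not to exist in the Drive account
    fs = GDriveFileSystem("root/definitely-missing", auth)
    try:
        fs.ls("root/definitely-missing")
    except FileNotFoundError as exc:
        # Expected to carry a message along the lines of
        # "Confirm root/definitely-missing exists and you can access it"
        print(exc)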
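
A side note on why fs_factory flips GDriveFileSystem.cachable: fsspec caches filesystem instances, so with the default cachable = True two constructions with identical arguments return the same object and would share an already warmed-up name-to-ID cache; turning it off makes every _create_fs() call return a fresh instance with a cold cache. The sketch below is illustrative only; the auth object and path are placeholders, so the constructor calls are left commented out.

    from pydrive2.fs import GDriveFileSystem

    GDriveFileSystem.cachable = True   # fsspec default: instances are reused
    # fs1 = GDriveFileSystem("root/data", auth)
    # fs2 = GDriveFileSystem("root/data", auth)
    # assert fs1 is fs2                # same cached object, shared caches

    GDriveFileSystem.cachable = False  # what the fs_factory fixture sets
    # fs3 = GDriveFileSystem("root/data", auth)
    # assert fs3 is not fs1            # fresh instance, cold name-to-ID cache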