Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 16 additions & 17 deletions pydrive2/fs/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,17 +245,18 @@ def split_path(self, path):
@wrap_prop(threading.RLock())
@cached_property
def _ids_cache(self):
cache = {
"dirs": defaultdict(list),
"ids": {},
"root_id": self._get_item_id(
self.path,
use_cache=False,
hint="Confirm the directory exists and you can access it.",
),
}
cache = {"dirs": defaultdict(list), "ids": {}}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

C: had to cleanup this a bit (root id is not used anymore, also we now cache ids not a single id even for the base


base_item_ids = self._path_to_item_ids(self.base, use_cache=False)
if not base_item_ids:
raise FileNotFoundError(
errno.ENOENT,
os.strerror(errno.ENOENT),
f"Confirm {self.path} exists and you can access it",
)

self._cache_path_id(self.base, *base_item_ids, cache=cache)

self._cache_path_id(self.base, cache["root_id"], cache=cache)
return cache

def _cache_path_id(self, path, *item_ids, cache=None):
Expand Down Expand Up @@ -366,7 +367,7 @@ def _path_to_item_ids(self, path, create=False, use_cache=True):
[self._create_dir(min(parent_ids), title, path)] if create else []
)

def _get_item_id(self, path, create=False, use_cache=True, hint=None):
def _get_item_id(self, path, create=False, use_cache=True):
bucket, base = self.split_path(path)
assert bucket == self.root

Expand All @@ -375,9 +376,7 @@ def _get_item_id(self, path, create=False, use_cache=True, hint=None):
return min(item_ids)

assert not create
raise FileNotFoundError(
errno.ENOENT, os.strerror(errno.ENOENT), hint or path
)
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)

@_gdrive_retry
def _gdrive_create_dir(self, parent_id, title):
Expand Down Expand Up @@ -427,9 +426,9 @@ def info(self, path):

def ls(self, path, detail=False):
bucket, base = self.split_path(path)
assert bucket == self.root

dir_ids = self._path_to_item_ids(base)

if not dir_ids:
raise FileNotFoundError(
errno.ENOENT, os.strerror(errno.ENOENT), path
Expand Down Expand Up @@ -465,14 +464,14 @@ def ls(self, path, detail=False):

def find(self, path, detail=False, **kwargs):
bucket, base = self.split_path(path)

seen_paths = set()
assert bucket == self.root

# Make sure the base path is cached and dir_ids below has some
# dirs revelant to this call
self._path_to_item_ids(base)

dir_ids = [self._ids_cache["ids"].copy()]
seen_paths = set()
contents = []
while dir_ids:
query_ids = {
Expand Down
174 changes: 151 additions & 23 deletions pydrive2/test/test_fs.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from io import StringIO
import os
import posixpath
import secrets
Expand Down Expand Up @@ -25,17 +26,39 @@ def remote_dir(base_remote_dir):
return base_remote_dir + "/" + str(uuid.uuid4())


@pytest.fixture
def fs(tmpdir, base_remote_dir):
@pytest.fixture(scope="module")
def service_auth(tmp_path_factory):
setup_credentials()
auth = GoogleAuth(settings_file_path("default.yaml", tmpdir / ""))
tmpdir = tmp_path_factory.mktemp("settings")
auth = GoogleAuth(settings_file_path("default.yaml", wkdir=tmpdir))
auth.ServiceAuth()
return auth


@pytest.fixture(scope="module")
def fs_factory(base_remote_dir, service_auth):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am still not sure if we really need fs_factory() fixture. Readability and straightforward test is more important than the duplication.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, what is the alternative? I can still explore that in the followup ...

Copy link
Contributor

@skshetry skshetry Jan 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can initialize the base_remote_dir automatically in another fixture (with module scope and use it in fs. This will only be called once.

@pytest.fixture(scope="module")
def base_dir(base_remote_dir):
    fs = GDriveFileSystem(base_remote_dir, service_auth)
    fs._gdrive_create_dir(...)
    yield remote_dir
    # cleanup

@pytest.fixture
def fs(base_dir):
   ...

Then, you can just do fs = GDriveFileSystem(base_remote_dir, service_auth) wherever you are doing fs_factory(), right? Or, did I miss something here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay, I see. There are a few other things (e.g. I'm disabling fs caching in the fixture), probably it all can be moved to the base_dir or something, but I'm not sure it make a big difference

it's easier tbh for me to have all this logic in one place - easier to do changes.

base_item = None
GDriveFileSystem.cachable = False

def _create_fs():
nonlocal base_item
_, base = base_remote_dir.split("/", 1)
fs = GDriveFileSystem(base_remote_dir, service_auth)
if base_item is None:
base_item = fs._gdrive_create_dir("root", base)

return fs, base_item

yield _create_fs

GDriveFileSystem.cachable = True
fs = GDriveFileSystem(base_remote_dir, service_auth)
fs.rm_file(base_remote_dir)

bucket, base = base_remote_dir.split("/", 1)
fs = GDriveFileSystem(base_remote_dir, auth)
fs._gdrive_create_dir("root", base)

return fs
@pytest.fixture
def fs(fs_factory):
return fs_factory()[0]


@pytest.mark.manual
Expand Down Expand Up @@ -66,7 +89,7 @@ def test_fs_service_json(base_remote_dir):
)


def test_info(fs, tmpdir, remote_dir):
def test_info(fs, remote_dir):
fs.touch(remote_dir + "/info/a.txt")
fs.touch(remote_dir + "/info/b.txt")
details = fs.info(remote_dir + "/info/a.txt")
Expand Down Expand Up @@ -116,20 +139,20 @@ def test_rm(fs, remote_dir):
assert not fs.exists(remote_dir + "/dir/c/a")


def test_ls(fs: GDriveFileSystem, remote_dir):
_, base = fs.split_path(remote_dir + "dir/")
def test_ls(fs, remote_dir):
_, base = fs.split_path(remote_dir + "/dir/")
fs._path_to_item_ids(base, create=True)
assert fs.ls(remote_dir + "dir/") == []
assert fs.ls(remote_dir + "/dir/") == []

files = set()
for no in range(8):
file = remote_dir + f"dir/test_{no}"
file = remote_dir + f"/dir/test_{no}"
fs.touch(file)
files.add(file)

assert set(fs.ls(remote_dir + "dir/")) == files
assert set(fs.ls(remote_dir + "/dir/")) == files

dirs = fs.ls(remote_dir + "dir/", detail=True)
dirs = fs.ls(remote_dir + "/dir/", detail=True)
expected = [fs.info(file) for file in files]

def by_name(details):
Expand All @@ -141,12 +164,95 @@ def by_name(details):
assert dirs == expected


def test_basic_ops_caching(fs_factory, remote_dir, mocker):
# Internally we have to derefence names into IDs to call GDrive APIs
# we are trying hard to cache those and make sure that operations like
# exists, ls, find, etc. don't hit the API more than once per path

# ListFile (_gdrive_list) is the main operation that we use to retieve file
# metadata in all operations like find/ls/exist - etc. It should be fine as
# a basic benchmark to count those.
# Note: we can't count direct API calls since we have retries, also can't
# count even direct calls to the GDrive client - for the same reason
fs, _ = fs_factory()
spy = mocker.spy(fs, "_gdrive_list")

dir_path = remote_dir + "/a/b/c/"
file_path = dir_path + "test.txt"
fs.touch(file_path)

assert spy.call_count == 5
spy.reset_mock()

fs.exists(file_path)
assert spy.call_count == 1
spy.reset_mock()

fs.ls(remote_dir)
assert spy.call_count == 1
spy.reset_mock()

fs.ls(dir_path)
assert spy.call_count == 1
spy.reset_mock()

fs.find(dir_path)
assert spy.call_count == 1
spy.reset_mock()

fs.find(remote_dir)
assert spy.call_count == 1
spy.reset_mock()


def test_ops_work_with_duplicate_names(fs_factory, remote_dir):
fs, base_item = fs_factory()

remote_dir_item = fs._gdrive_create_dir(
base_item["id"], remote_dir.split("/")[-1]
)
dir_name = str(uuid.uuid4())
dir1 = fs._gdrive_create_dir(remote_dir_item["id"], dir_name)
dir2 = fs._gdrive_create_dir(remote_dir_item["id"], dir_name)

# Two directories were created with the same name
assert dir1["id"] != dir2["id"]

dir_path = remote_dir + "/" + dir_name + "/"

# ls returns both of them, even though the names are the same
test_fs = fs
result = test_fs.ls(remote_dir)
assert len(result) == 2
assert set(result) == {dir_path}

# ls returns both of them, even though the names are the same
test_fs, _ = fs_factory()
result = test_fs.ls(remote_dir)
assert len(result) == 2
assert set(result) == {dir_path}

for test_fs in [fs, fs_factory()[0]]:
# find by default doesn't return dirs at all
result = test_fs.find(remote_dir)
assert len(result) == 0

fs._gdrive_upload_fobj("a.txt", dir1["id"], StringIO(""))
fs._gdrive_upload_fobj("b.txt", dir2["id"], StringIO(""))

for test_fs in [fs, fs_factory()[0]]:
# now we should have both files
result = test_fs.find(remote_dir)
assert len(result) == 2
assert set(result) == {dir_path + file for file in ["a.txt", "b.txt"]}


def test_ls_non_existing_dir(fs, remote_dir):
with pytest.raises(FileNotFoundError):
fs.ls(remote_dir + "dir/")


def test_find(fs, remote_dir):
def test_find(fs, fs_factory, remote_dir):
fs.mkdir(remote_dir + "/dir")

files = [
Expand All @@ -169,12 +275,24 @@ def test_find(fs, remote_dir):
for file in files:
fs.touch(file)

assert set(fs.find(remote_dir)) == set(files)
for test_fs in [fs, fs_factory()[0]]:
# Test for https://github.com/iterative/PyDrive2/issues/229
# It must go first, so that we test with a cache miss as well
assert set(test_fs.find(remote_dir + "/dir/c/d/")) == set(
[
file
for file in files
if file.startswith(remote_dir + "/dir/c/d/")
]
)

# General find test
assert set(test_fs.find(remote_dir)) == set(files)

find_results = fs.find(remote_dir, detail=True)
info_results = [fs.info(file) for file in files]
info_results = {content["name"]: content for content in info_results}
assert find_results == info_results
find_results = test_fs.find(remote_dir, detail=True)
info_results = [test_fs.info(file) for file in files]
info_results = {content["name"]: content for content in info_results}
assert find_results == info_results


def test_exceptions(fs, tmpdir, remote_dir):
Expand All @@ -199,15 +317,20 @@ def test_open_rw(fs, remote_dir):
assert stream.read() == data


def test_concurrent_operations(fs, remote_dir):
def test_concurrent_operations(fs, fs_factory, remote_dir):
# Include an extra dir name to force upload operations creating it
# this way we can also test that only a single directory is created
# enven if multiple threads are uploading files into the same dir
dir_name = secrets.token_hex(16)

def create_random_file():
name = secrets.token_hex(16)
with fs.open(remote_dir + "/" + name, "w") as stream:
with fs.open(remote_dir + f"/{dir_name}/" + name, "w") as stream:
stream.write(name)
return name

def read_random_file(name):
with fs.open(remote_dir + "/" + name, "r") as stream:
with fs.open(remote_dir + f"/{dir_name}/" + name, "r") as stream:
return stream.read()

with futures.ThreadPoolExecutor() as executor:
Expand All @@ -225,6 +348,11 @@ def read_random_file(name):

assert write_names == read_names

# Test that only a single dir is cretead
for test_fs in [fs, fs_factory()[0]]:
results = test_fs.ls(remote_dir)
assert results == [remote_dir + f"/{dir_name}/"]


def test_put_file(fs, tmpdir, remote_dir):
src_file = tmpdir / "a.txt"
Expand Down
4 changes: 3 additions & 1 deletion pydrive2/test/test_util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
import random
import re
import os
Expand Down Expand Up @@ -29,7 +30,8 @@ def setup_credentials(credentials_path=DEFAULT_USER_CREDENTIALS_FILE):

def settings_file_path(settings_file, wkdir=LOCAL_PATH):
template_path = SETTINGS_PATH + settings_file
local_path = wkdir + settings_file
wkdir = Path(wkdir)
local_path = wkdir / settings_file
assert os.path.exists(template_path)
if not os.path.exists(wkdir):
os.makedirs(wkdir, exist_ok=True)
Expand Down