Skip to content

Commit 8975c0f

Browse files
committed
up docs and tests
1 parent c3f07a5 commit 8975c0f

File tree

6 files changed

+52
-127
lines changed

6 files changed

+52
-127
lines changed

docs/guides/storage_clients.mdx

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ Crawlee provides three main storage client implementations:
2828

2929
- <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with in-memory caching.
3030
- <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence.
31-
- <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/) or [PostgreSQL](https://www.postgresql.org/)). Requires installing the extra dependency: 'crawlee[sql_sqlite]' for SQLite or 'crawlee[sql_postgres]' for PostgreSQL.
32-
- <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> Provides persistent storage using a [Redis](https://redis.io/) database v8.0+. Requires installing the extra dependency `crawlee[redis]`.
31+
- <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> - Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/) or [PostgreSQL](https://www.postgresql.org/)). Requires installing the extra dependency: `crawlee[sql_sqlite]` for SQLite or `crawlee[sql_postgres]` for PostgreSQL.
32+
- <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> - Provides persistent storage using a [Redis](https://redis.io/) database v8.0+. Requires installing the extra dependency `crawlee[redis]`.
3333
- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python).
3434

3535
```mermaid
@@ -310,8 +310,8 @@ Configuration options for the <ApiLink to="class/SqlStorageClient">`SqlStorageCl
310310

311311
Configuration options for the <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> can be set via constructor arguments:
312312

313-
- **`connection_string`** (default: SQLite in <ApiLink to="class/Configuration">`Configuration`</ApiLink> storage dir) SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db` or `postgresql+asyncpg://user:pass@host/db`.
314-
- **`engine`** Pre-configured SQLAlchemy AsyncEngine (optional).
313+
- **`connection_string`** (default: SQLite in <ApiLink to="class/Configuration">`Configuration`</ApiLink> storage dir) - SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db` or `postgresql+asyncpg://user:pass@host/db`.
314+
- **`engine`** - Pre-configured SQLAlchemy AsyncEngine (optional).
315315

316316
For advanced scenarios, you can configure <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> with a custom SQLAlchemy engine and additional options via the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class. This is useful, for example, when connecting to an external PostgreSQL database or customizing connection pooling.
317317

@@ -369,12 +369,12 @@ class RedisDatasetClient {
369369
%% Dataset Keys
370370
%% ========================
371371
372-
class Dataset_Keys {
372+
class DatasetKeys {
373373
datasets:[name]:items - JSON Array
374374
datasets:[name]:metadata - JSON Object
375375
}
376376
377-
class Datasets_Indexes {
377+
class DatasetsIndexes {
378378
datasets:id_to_name - Hash
379379
datasets:name_to_id - Hash
380380
}
@@ -383,8 +383,8 @@ class Datasets_Indexes {
383383
%% Client to Keys arrows
384384
%% ========================
385385
386-
RedisDatasetClient --> Dataset_Keys
387-
RedisDatasetClient --> Datasets_Indexes
386+
RedisDatasetClient --> DatasetKeys
387+
RedisDatasetClient --> DatasetsIndexes
388388
```
389389

390390
```mermaid
@@ -408,13 +408,13 @@ class RedisKeyValueStoreClient {
408408
%% Key-Value Store Keys
409409
%% ========================
410410
411-
class Key_Value_Store_Keys {
411+
class KeyValueStoreKeys {
412412
key_value_stores:[name]:items - Hash
413413
key_value_stores:[name]:metadata_items - Hash
414414
key_value_stores:[name]:metadata - JSON Object
415415
}
416416
417-
class Key_Value_Stores_Indexes {
417+
class KeyValueStoresIndexes {
418418
key_value_stores:id_to_name - Hash
419419
key_value_stores:name_to_id - Hash
420420
}
@@ -423,8 +423,8 @@ class Key_Value_Stores_Indexes {
423423
%% Client to Keys arrows
424424
%% ========================
425425
426-
RedisKeyValueStoreClient --> Key_Value_Store_Keys
427-
RedisKeyValueStoreClient --> Key_Value_Stores_Indexes
426+
RedisKeyValueStoreClient --> KeyValueStoreKeys
427+
RedisKeyValueStoreClient --> KeyValueStoresIndexes
428428
```
429429

430430
```mermaid
@@ -448,7 +448,7 @@ class RedisRequestQueueClient {
448448
%% Request Queue Keys
449449
%% ========================
450450
451-
class Request_Queue_Keys{
451+
class RequestQueueKeys{
452452
request_queues:[name]:queue - List
453453
request_queues:[name]:data - Hash
454454
request_queues:[name]:in_progress - Hash
@@ -459,7 +459,7 @@ class Request_Queue_Keys{
459459
request_queues:[name]:metadata - JSON Object
460460
}
461461
462-
class Request_Queues_Indexes {
462+
class RequestQueuesIndexes {
463463
request_queues:id_to_name - Hash
464464
request_queues:name_to_id - Hash
465465
}
@@ -468,8 +468,8 @@ class Request_Queues_Indexes {
468468
%% Client to Keys arrows
469469
%% ========================
470470
471-
RedisRequestQueueClient --> Request_Queue_Keys
472-
RedisRequestQueueClient --> Request_Queues_Indexes
471+
RedisRequestQueueClient --> RequestQueueKeys
472+
RedisRequestQueueClient --> RequestQueuesIndexes
473473
```
474474

475475
Configuration options for the <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class:
@@ -478,8 +478,8 @@ Configuration options for the <ApiLink to="class/RedisStorageClient">`RedisStora
478478

479479
Configuration options for the <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> can be set via constructor arguments:
480480

481-
- **`connection_string`** Redis connection string, e.g. `redis://localhost:6379/0`.
482-
- **`redis`** Pre-configured Redis client instance (optional).
481+
- **`connection_string`** - Redis connection string, e.g. `redis://localhost:6379/0`.
482+
- **`redis`** - Pre-configured Redis client instance (optional).
483483

484484
<CodeBlock className="language-python" language="python">
485485
{RedisStorageClientConfigurationExample}

tests/unit/storage_clients/_redis/test_redis_dataset_client.py

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -144,36 +144,3 @@ async def test_metadata_record_updates(dataset_client: RedisDatasetClient) -> No
144144
assert metadata.created_at == initial_created
145145
assert metadata.modified_at > initial_modified
146146
assert metadata.accessed_at > accessed_after_get
147-
148-
# Verify metadata file is updated in Redis
149-
metadata_json = await await_redis_response(dataset_client.redis.json().get(f'datasets:{metadata.name}:metadata'))
150-
151-
assert isinstance(metadata_json, dict)
152-
assert metadata_json['item_count'] == 1 # type: ignore[unreachable] # py-json typing is broken
153-
154-
155-
@pytest.mark.usefixtures('suppress_user_warning')
156-
async def test_data_persistence_across_reopens(redis_client: FakeAsyncRedis) -> None:
157-
"""Test that data persists correctly when reopening the same dataset."""
158-
storage_client = RedisStorageClient(redis=redis_client)
159-
160-
# Create dataset and add data
161-
original_client = await storage_client.create_dataset_client(
162-
name='persistence-test',
163-
)
164-
165-
test_data = {'test_item': 'test_value', 'id': 123}
166-
await original_client.push_data(test_data)
167-
168-
dataset_id = (await original_client.get_metadata()).id
169-
170-
# Reopen by ID and verify data persists
171-
reopened_client = await storage_client.create_dataset_client(
172-
id=dataset_id,
173-
)
174-
175-
data = await reopened_client.get_data()
176-
assert len(data.items) == 1
177-
assert data.items[0] == test_data
178-
179-
await reopened_client.drop()

tests/unit/storage_clients/_redis/test_redis_kvs_client.py

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -215,31 +215,3 @@ async def test_metadata_record_updates(kvs_client: RedisKeyValueStoreClient) ->
215215
assert metadata.created_at == initial_created
216216
assert metadata.modified_at > initial_modified
217217
assert metadata.accessed_at > accessed_after_read
218-
219-
220-
@pytest.mark.usefixtures('suppress_user_warning')
221-
async def test_data_persistence_across_reopens(redis_client: FakeAsyncRedis) -> None:
222-
"""Test that data persists correctly when reopening the same KVS."""
223-
storage_client = RedisStorageClient(redis=redis_client)
224-
225-
# Create KVS and add data
226-
original_client = await storage_client.create_kvs_client(
227-
name='persistence-test',
228-
)
229-
230-
test_key = 'persistent-key'
231-
test_value = 'persistent-value'
232-
await original_client.set_value(key=test_key, value=test_value)
233-
234-
kvs_id = (await original_client.get_metadata()).id
235-
236-
# Reopen by ID and verify data persists
237-
reopened_client = await storage_client.create_kvs_client(
238-
id=kvs_id,
239-
)
240-
241-
record = await reopened_client.get_value(key=test_key)
242-
assert record is not None
243-
assert record.value == test_value
244-
245-
await reopened_client.drop()

tests/unit/storage_clients/_redis/test_redis_rq_client.py

Lines changed: 30 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -199,48 +199,6 @@ async def test_metadata_file_updates(rq_client: RedisRequestQueueClient) -> None
199199
assert metadata.modified_at > initial_modified
200200
assert metadata.accessed_at > accessed_after_read
201201

202-
# Verify metadata file is updated in Redis
203-
metadata_json = await await_redis_response(rq_client.redis.json().get('request_queues:test_request_queue:metadata'))
204-
assert isinstance(metadata_json, dict)
205-
assert metadata_json['total_request_count'] == 1 # type: ignore[unreachable] # py-json typing is broken
206-
207-
208-
@pytest.mark.usefixtures('suppress_user_warning')
209-
async def test_data_persistence_across_reopens(redis_client: FakeAsyncRedis) -> None:
210-
"""Test that requests persist correctly when reopening the same RQ."""
211-
storage_client = RedisStorageClient(redis=redis_client)
212-
213-
# Create RQ and add requests
214-
original_client = await storage_client.create_rq_client(
215-
name='persistence-test',
216-
)
217-
218-
test_requests = [
219-
Request.from_url('https://example.com/1'),
220-
Request.from_url('https://example.com/2'),
221-
]
222-
await original_client.add_batch_of_requests(test_requests)
223-
224-
rq_id = (await original_client.get_metadata()).id
225-
226-
# Reopen by ID and verify requests persist
227-
reopened_client = await storage_client.create_rq_client(
228-
id=rq_id,
229-
)
230-
231-
metadata = await reopened_client.get_metadata()
232-
assert metadata.total_request_count == 2
233-
234-
# Fetch requests to verify they're still there
235-
request1 = await reopened_client.fetch_next_request()
236-
request2 = await reopened_client.fetch_next_request()
237-
238-
assert request1 is not None
239-
assert request2 is not None
240-
assert {request1.url, request2.url} == {'https://example.com/1', 'https://example.com/2'}
241-
242-
await reopened_client.drop()
243-
244202

245203
async def test_get_request(rq_client: RedisRequestQueueClient) -> None:
246204
"""Test that get_request works correctly."""
@@ -262,3 +220,33 @@ async def test_get_request(rq_client: RedisRequestQueueClient) -> None:
262220
# Test fetching a non-existent request
263221
non_existent = await rq_client.get_request('non-existent-id')
264222
assert non_existent is None
223+
224+
225+
async def test_deduplication(rq_client: RedisRequestQueueClient) -> None:
226+
"""Test that request deduplication works correctly."""
227+
requests = [
228+
Request.from_url('https://example.com/1'),
229+
Request.from_url('https://example.com/1'),
230+
Request.from_url('https://example.com/3'),
231+
]
232+
233+
await rq_client.add_batch_of_requests(requests)
234+
235+
# Verify only unique requests are added
236+
metadata = await rq_client.get_metadata()
237+
assert metadata.pending_request_count == 2
238+
assert metadata.total_request_count == 2
239+
240+
# Fetch requests and verify order
241+
request1 = await rq_client.fetch_next_request()
242+
assert request1 is not None
243+
assert request1 == requests[0]
244+
245+
# Fetch the next request, which should skip the duplicate
246+
request2 = await rq_client.fetch_next_request()
247+
assert request2 is not None
248+
assert request2 == requests[2]
249+
250+
# Verify no more requests are available
251+
request3 = await rq_client.fetch_next_request()
252+
assert request3 is None

tests/unit/storages/conftest.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from fakeredis import FakeAsyncRedis
1818

1919

20-
@pytest.fixture(params=['memory', 'file_system', 'sql', 'redis_default', 'redis_bloom'])
20+
@pytest.fixture(params=['memory', 'file_system', 'sql', 'redis'])
2121
def storage_client(
2222
request: pytest.FixtureRequest,
2323
redis_client: FakeAsyncRedis,
@@ -31,10 +31,8 @@ def storage_client(
3131
storage_client = MemoryStorageClient()
3232
elif storage_type == 'sql':
3333
storage_client = SqlStorageClient()
34-
elif storage_type == 'redis_default':
35-
storage_client = RedisStorageClient(redis=redis_client, queue_dedup_strategy='default')
36-
elif storage_type == 'redis_bloom':
37-
storage_client = RedisStorageClient(redis=redis_client, queue_dedup_strategy='bloom')
34+
elif storage_type == 'redis':
35+
storage_client = RedisStorageClient(redis=redis_client)
3836
else:
3937
storage_client = FileSystemStorageClient()
4038
service_locator.set_storage_client(storage_client)

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)