
Commit f5b2ae7

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent bc990b8 · commit f5b2ae7

21 files changed: +1457 additions, -1452 deletions

src/vdf_io/notebooks/01_Introducing_txtai.ipynb

Lines changed: 1115 additions & 1058 deletions
Large diffs are not rendered by default.

src/vdf_io/notebooks/aiven-qs.ipynb

Lines changed: 5 additions & 2 deletions
@@ -33,7 +33,7 @@
 },
 "outputs": [],
 "source": [
-"from rich import print as rprint\n"
+"from rich import print as rprint"
 ]
 },
 {
@@ -181,7 +181,10 @@
 "auth_provider = PlainTextAuthProvider(\n",
 "    os.environ.get(\"CASSANDRA_USER\"), os.environ.get(\"CASSANDRA_PASSWORD\")\n",
 ")\n",
-"ssl_options = {\"ca_certs\": \"/Users/dhruvanand/Code/vector-io/aiven.pem\", \"cert_reqs\": ssl.CERT_REQUIRED}\n",
+"ssl_options = {\n",
+"    \"ca_certs\": \"/Users/dhruvanand/Code/vector-io/aiven.pem\",\n",
+"    \"cert_reqs\": ssl.CERT_REQUIRED,\n",
+"}\n",
 "CASSANDRA_URI = os.environ.get(\"CASSANDRA_URI\")\n",
 "CASSANDRA_URI, CASSANDRA_PORT = CASSANDRA_URI.split(\":\")\n",
 "with Cluster(\n",

src/vdf_io/notebooks/astra_usage.ipynb

Lines changed: 12 additions & 13 deletions
@@ -162,7 +162,7 @@
 },
 "outputs": [],
 "source": [
-"table = coll.find()['data']['documents']"
+"table = coll.find()[\"data\"][\"documents\"]"
 ]
 },
 {
@@ -325,6 +325,7 @@
 "source": [
 "# convert list of dicts to pd.DataFrame\n",
 "import pandas as pd\n",
+"\n",
 "df = pd.DataFrame(table)\n",
 "df.head()"
 ]
@@ -359,7 +360,7 @@
 }
 ],
 "source": [
-"len(resp['data']['documents'])"
+"len(resp[\"data\"][\"documents\"])"
 ]
 },
 {
@@ -379,7 +380,7 @@
 }
 ],
 "source": [
-"resp['data']['documents'][0].keys()"
+"resp[\"data\"][\"documents\"][0].keys()"
 ]
 },
 {
@@ -497,14 +498,12 @@
 "source": [
 "# write to a new parquet file\n",
 "import pyarrow as pa\n",
-"import pyarrow.parquet as pq\n",
 "\n",
 "table = pa.Table.from_pandas(coll.find().to_pandas())\n",
 "\n",
 "for r in coll.paginated_find():\n",
 "    print(type(r), r.keys())\n",
-"    # append data into a parquet file\n",
-"    "
+"    # append data into a parquet file"
 ]
 },
 {
@@ -665,9 +664,9 @@
 }
 ],
 "source": [
-"i=0\n",
+"i = 0\n",
 "for r in tqdm(collection2.paginated_find()):\n",
-"    i+=1\n",
+"    i += 1\n",
 "print(i)"
 ]
 },
@@ -690,7 +689,7 @@
 "    break\n",
 "    next_page_state = a[\"data\"][\"nextPageState\"]\n",
 "    i += 1\n",
-"    print(i,len(id_set), tot_docs)\n",
+"    print(i, len(id_set), tot_docs)\n",
 "# len(a[\"data\"][\"documents\"])\n",
 "# len(id_set)"
 ]
@@ -728,7 +727,7 @@
 }
 ],
 "source": [
-"a['data']['documents'][0]"
+"a[\"data\"][\"documents\"][0]"
 ]
 },
 {
@@ -748,7 +747,7 @@
 }
 ],
 "source": [
-"len(collection2.find_one()['data']['document'][\"vector\"])"
+"len(collection2.find_one()[\"data\"][\"document\"][\"vector\"])"
 ]
 },
 {
@@ -768,8 +767,8 @@
 }
 ],
 "source": [
-"mr=db.collection(\"movie_reviews\")\n",
-"mr.find_one()['data']['document'].keys()"
+"mr = db.collection(\"movie_reviews\")\n",
+"mr.find_one()[\"data\"][\"document\"].keys()"
 ]
 },
 {
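
Most of these cells follow the same flow: pull documents out of the JSON-API style response, then work with them in pandas. A rough sketch of that flow, reusing the db/coll Astra DB collection objects the notebook creates earlier (their construction is not part of this diff):

import pandas as pd

# coll is assumed to be the Astra DB collection set up earlier in the notebook.
# find() returns a JSON-API style response; documents sit under ["data"]["documents"].
table = coll.find()["data"]["documents"]
df = pd.DataFrame(table)  # list of dicts -> DataFrame
print(df.head())

# paginated_find() iterates past the single-response page limit; the notebook uses it
# both to inspect result keys and to count documents.
count = 0
for r in coll.paginated_find():
    count += 1
print(count)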

src/vdf_io/notebooks/chroma-qs.ipynb

Lines changed: 21 additions & 18 deletions
@@ -191,7 +191,7 @@
 },
 "outputs": [],
 "source": [
-"import chromadb\n"
+"import chromadb"
 ]
 },
 {
@@ -200,9 +200,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"\n",
 "# setup Chroma in-memory, for easy prototyping. Can add persistence easily!\n",
-"client = chromadb.PersistentClient()\n"
+"client = chromadb.PersistentClient()"
 ]
 },
 {
@@ -247,21 +246,23 @@
 }
 ],
 "source": [
-"\n",
 "# Create collection. get_collection, get_or_create_collection, delete_collection also available!\n",
 "collection2 = client.get_or_create_collection(\"test\")\n",
 "\n",
 "# Add docs to the collection. Can also update and delete. Row-based API coming soon!\n",
 "collection2.add(\n",
-"    documents=[\"This is document1\", \"This is document2\"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well\n",
-"    metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}], # filter on these!\n",
-"    ids=[\"doc1\", \"doc2\"], # unique for each doc\n",
-"    embeddings=[[1,2,3], [4,5,6]] # optional, we can also embed for you\n",
+"    documents=[\n",
+"        \"This is document1\",\n",
+"        \"This is document2\",\n",
+"    ],  # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well\n",
+"    metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}],  # filter on these!\n",
+"    ids=[\"doc1\", \"doc2\"],  # unique for each doc\n",
+"    embeddings=[[1, 2, 3], [4, 5, 6]],  # optional, we can also embed for you\n",
 ")\n",
 "\n",
 "# Query/search 2 most similar results. You can also .get by id\n",
 "results = collection2.query(\n",
-"    query_embeddings=[[1,2,3]],\n",
+"    query_embeddings=[[1, 2, 3]],\n",
 "    n_results=2,\n",
 "    # where={\"metadata_field\": \"is_equal_to_this\"}, # optional filter\n",
 "    # where_document={\"$contains\":\"search_string\"} # optional filter\n",
@@ -304,7 +305,7 @@
 },
 "outputs": [],
 "source": [
-"coll=client.get_collection(\"test\")"
+"coll = client.get_collection(\"test\")"
 ]
 },
 {
@@ -335,7 +336,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"client.delete_collection(\"test\") # delete collection"
+"client.delete_collection(\"test\")  # delete collection"
 ]
 },
 {
@@ -357,7 +358,7 @@
 }
 ],
 "source": [
-"client.list_collections() # list collections"
+"client.list_collections()  # list collections"
 ]
 },
 {
@@ -399,7 +400,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"pclient = chromadb.PersistentClient(\"~/.chroma4\") # persistent client\n",
+"pclient = chromadb.PersistentClient(\"~/.chroma4\")  # persistent client\n",
 "\n",
 "pcol = pclient.create_collection(\"test3\")"
 ]
@@ -414,8 +415,8 @@
 "    documents=[\"This is document1\", \"This is document2\"],\n",
 "    metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}],\n",
 "    ids=[\"doc1\", \"doc2\"],\n",
-"    embeddings=[[1,2,3], [4,5,6]]\n",
-")\n"
+"    embeddings=[[1, 2, 3], [4, 5, 6]],\n",
+")"
 ]
 },
 {
@@ -479,7 +480,9 @@
 },
 "outputs": [],
 "source": [
-"client2= chromadb.PersistentClient(\"/Users/dhruvanand/Code/vector-io/src/vdf_io/notebooks/chroma\")\n"
+"client2 = chromadb.PersistentClient(\n",
+"    \"/Users/dhruvanand/Code/vector-io/src/vdf_io/notebooks/chroma\"\n",
+")"
 ]
 },
 {
@@ -559,7 +562,7 @@
 },
 "outputs": [],
 "source": [
-"coll2=client2.get_collection(\"vdf_2024_9-11\")"
+"coll2 = client2.get_collection(\"vdf_2024_9-11\")"
 ]
 },
 {
@@ -653,7 +656,7 @@
 "    documents=[\"This is document1\", \"This is document2\"],\n",
 "    metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}],\n",
 "    ids=[\"doc5\", \"doc6\"],\n",
-"    embeddings=[[1,2,3], None]\n",
+"    embeddings=[[1, 2, 3], None],\n",
 ")"
 ]
 },
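
Taken together, these cells exercise a small slice of the Chroma client API. A self-contained sketch of those calls, assuming chromadb is installed (collection and document names are the notebook's own examples):

import chromadb

client = chromadb.PersistentClient()  # persists to ./chroma by default
collection = client.get_or_create_collection("test")

collection.add(
    documents=["This is document1", "This is document2"],
    metadatas=[{"source": "notion"}, {"source": "google-docs"}],  # filterable metadata
    ids=["doc1", "doc2"],  # unique per document
    embeddings=[[1, 2, 3], [4, 5, 6]],  # optional; Chroma can also embed for you
)

# Query the 2 most similar results by embedding.
results = collection.query(query_embeddings=[[1, 2, 3]], n_results=2)
print(results["ids"])

print(client.list_collections())  # list collections
client.delete_collection("test")  # delete collection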

src/vdf_io/notebooks/deeplake.ipynb

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@
 "source": [
 "import deeplake\n",
 "\n",
-"ds = deeplake.load('hub://activeloop/coco-train')"
+"ds = deeplake.load(\"hub://activeloop/coco-train\")"
 ]
 },
 {

src/vdf_io/notebooks/json_pandas.ipynb

Lines changed: 6 additions & 14 deletions
@@ -17,34 +17,26 @@
 "source": [
 "import json\n",
 "import pandas as pd\n",
-"js=[\n",
+"\n",
+"js = [\n",
 "    {\n",
 "        \"id\": 1,\n",
 "        \"name\": \"John Doe\",\n",
 "        \"age\": 30,\n",
-"        \"attribute\": {\n",
-"            \"height\": 176,\n",
-"            \"weight\": 80\n",
-"        }\n",
+"        \"attribute\": {\"height\": 176, \"weight\": 80},\n",
 "    },\n",
 "    {\n",
 "        \"id\": 2,\n",
 "        \"name\": \"Alice Smith\",\n",
 "        \"age\": 28,\n",
-"        \"attribute\": {\n",
-"            \"height\": 167,\n",
-"            \"weight\": 55\n",
-"        }\n",
+"        \"attribute\": {\"height\": 167, \"weight\": 55},\n",
 "    },\n",
 "    {\n",
 "        \"id\": 3,\n",
 "        \"name\": \"Bob Johnson\",\n",
 "        \"age\": 35,\n",
-"        \"attribute\": {\n",
-"            \"height\": 192,\n",
-"            \"weight\": 85\n",
-"        }\n",
-"    }\n",
+"        \"attribute\": {\"height\": 192, \"weight\": 85},\n",
+"    },\n",
 "]\n",
 "\n",
 "df = pd.read_json(json.dumps(js))"

src/vdf_io/notebooks/jsonl_to_parquet.ipynb

Lines changed: 9 additions & 9 deletions
@@ -15,9 +15,7 @@
 "outputs": [],
 "source": [
 "# Importing required libraries\n",
-"import pandas as pd\n",
-"import pyarrow as pa\n",
-"import pyarrow.parquet as pq"
+"import pandas as pd"
 ]
 },
 {
@@ -35,7 +33,7 @@
 "outputs": [],
 "source": [
 "# Load JSONL File\n",
-"jsonl_file = '/Users/dhruvanand/Code/datasets-dumps/shard-00000.jsonl 2'\n"
+"jsonl_file = \"/Users/dhruvanand/Code/datasets-dumps/shard-00000.jsonl 2\""
 ]
 },
 {
@@ -53,7 +51,9 @@
 "outputs": [],
 "source": [
 "# Convert JSONL to DataFrame\n",
-"df = pd.read_json(jsonl_file, lines=True) # Convert the loaded jsonl data into a pandas DataFrame"
+"df = pd.read_json(\n",
+"    jsonl_file, lines=True\n",
+")  # Convert the loaded jsonl data into a pandas DataFrame"
 ]
 },
 {
@@ -172,7 +172,7 @@
 }
 ],
 "source": [
-"df['metadata'].iloc[3]"
+"df[\"metadata\"].iloc[3]"
 ]
 },
 {
@@ -213,9 +213,9 @@
 "outputs": [],
 "source": [
 "# Save DataFrame as Parquet\n",
-"for comp in ['snappy', 'gzip', 'brotli', 'zstd']:\n",
-"    parquet_file = f'path_to_output_file-{comp}.parquet' # replace with your desired output parquet file path\n",
-"    df.to_parquet(parquet_file, engine='pyarrow', compression=comp)"
+"for comp in [\"snappy\", \"gzip\", \"brotli\", \"zstd\"]:\n",
+"    parquet_file = f\"path_to_output_file-{comp}.parquet\"  # replace with your desired output parquet file path\n",
+"    df.to_parquet(parquet_file, engine=\"pyarrow\", compression=comp)"
 ]
 },
 {
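
The whole notebook reduces to: read a JSONL file (one JSON object per line) into a DataFrame, then write it out once per compression codec. A sketch with placeholder paths and an added size comparison; brotli and zstd need their codecs installed alongside pyarrow:

import os

import pandas as pd

jsonl_file = "shard-00000.jsonl"  # placeholder path
df = pd.read_json(jsonl_file, lines=True)  # one JSON object per line

for comp in ["snappy", "gzip", "brotli", "zstd"]:
    parquet_file = f"output-{comp}.parquet"  # placeholder output path
    df.to_parquet(parquet_file, engine="pyarrow", compression=comp)
    print(comp, os.path.getsize(parquet_file), "bytes")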

src/vdf_io/notebooks/jsonltgz_to_parquet.ipynb

Lines changed: 3 additions & 5 deletions
@@ -17,9 +17,7 @@
 "outputs": [],
 "source": [
 "# Importing required libraries\n",
-"import pandas as pd\n",
-"import pyarrow as pa\n",
-"import pyarrow.parquet as pq"
+"import pandas as pd"
 ]
 },
 {
@@ -71,7 +69,7 @@
 "with open(jsonl_file) as f:\n",
 "    for i, l in enumerate(f):\n",
 "        pass\n",
-"    print(i)\n"
+"    print(i)"
 ]
 },
 {
@@ -120,7 +118,7 @@
 }
 ],
 "source": [
-"df.iloc[0]['url']"
+"df.iloc[0][\"url\"]"
 ]
 },
 {
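
The line-count cell above enumerates the open file handle directly. A sketch of that idiom with a placeholder path; note that enumerate starts at 0, so the total number of lines is i + 1:

jsonl_file = "shard-00000.jsonl"  # placeholder path to an already-extracted .jsonl file

i = -1  # guard in case the file is empty
with open(jsonl_file) as f:
    for i, line in enumerate(f):
        pass
print(i + 1)  # enumerate is 0-based, so i + 1 is the line count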
