
Commit f5b2ae7

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent bc990b8 · commit f5b2ae7

21 files changed: +1457 additions, -1452 deletions

src/vdf_io/notebooks/01_Introducing_txtai.ipynb

Lines changed: 1115 additions & 1058 deletions
Large diffs are not rendered by default.

src/vdf_io/notebooks/aiven-qs.ipynb

Lines changed: 5 additions & 2 deletions
@@ -33,7 +33,7 @@
 },
 "outputs": [],
 "source": [
-"from rich import print as rprint\n"
+"from rich import print as rprint"
 ]
 },
 {
@@ -181,7 +181,10 @@
 "auth_provider = PlainTextAuthProvider(\n",
 "    os.environ.get(\"CASSANDRA_USER\"), os.environ.get(\"CASSANDRA_PASSWORD\")\n",
 ")\n",
-"ssl_options = {\"ca_certs\": \"/Users/dhruvanand/Code/vector-io/aiven.pem\", \"cert_reqs\": ssl.CERT_REQUIRED}\n",
+"ssl_options = {\n",
+"    \"ca_certs\": \"/Users/dhruvanand/Code/vector-io/aiven.pem\",\n",
+"    \"cert_reqs\": ssl.CERT_REQUIRED,\n",
+"}\n",
 "CASSANDRA_URI = os.environ.get(\"CASSANDRA_URI\")\n",
 "CASSANDRA_URI, CASSANDRA_PORT = CASSANDRA_URI.split(\":\")\n",
 "with Cluster(\n",

src/vdf_io/notebooks/astra_usage.ipynb

Lines changed: 12 additions & 13 deletions
@@ -162,7 +162,7 @@
 },
 "outputs": [],
 "source": [
-"table = coll.find()['data']['documents']"
+"table = coll.find()[\"data\"][\"documents\"]"
 ]
 },
 {
@@ -325,6 +325,7 @@
 "source": [
 "# convert list of dicts to pd.DataFrame\n",
 "import pandas as pd\n",
+"\n",
 "df = pd.DataFrame(table)\n",
 "df.head()"
 ]
@@ -359,7 +360,7 @@
 }
 ],
 "source": [
-"len(resp['data']['documents'])"
+"len(resp[\"data\"][\"documents\"])"
 ]
 },
 {
@@ -379,7 +380,7 @@
 }
 ],
 "source": [
-"resp['data']['documents'][0].keys()"
+"resp[\"data\"][\"documents\"][0].keys()"
 ]
 },
 {
@@ -497,14 +498,12 @@
 "source": [
 "# write to a new parquet file\n",
 "import pyarrow as pa\n",
-"import pyarrow.parquet as pq\n",
 "\n",
 "table = pa.Table.from_pandas(coll.find().to_pandas())\n",
 "\n",
 "for r in coll.paginated_find():\n",
 "    print(type(r), r.keys())\n",
-"    # append data into a parquet file\n",
-"    "
+"    # append data into a parquet file"
 ]
 },
 {
@@ -665,9 +664,9 @@
 }
 ],
 "source": [
-"i=0\n",
+"i = 0\n",
 "for r in tqdm(collection2.paginated_find()):\n",
-"    i+=1\n",
+"    i += 1\n",
 "print(i)"
 ]
 },
@@ -690,7 +689,7 @@
 "    break\n",
 "    next_page_state = a[\"data\"][\"nextPageState\"]\n",
 "    i += 1\n",
-"    print(i,len(id_set), tot_docs)\n",
+"    print(i, len(id_set), tot_docs)\n",
 "# len(a[\"data\"][\"documents\"])\n",
 "# len(id_set)"
 ]
@@ -728,7 +727,7 @@
 }
 ],
 "source": [
-"a['data']['documents'][0]"
+"a[\"data\"][\"documents\"][0]"
 ]
 },
 {
@@ -748,7 +747,7 @@
 }
 ],
 "source": [
-"len(collection2.find_one()['data']['document'][\"vector\"])"
+"len(collection2.find_one()[\"data\"][\"document\"][\"vector\"])"
 ]
 },
 {
@@ -768,8 +767,8 @@
 }
 ],
 "source": [
-"mr=db.collection(\"movie_reviews\")\n",
-"mr.find_one()['data']['document'].keys()"
+"mr = db.collection(\"movie_reviews\")\n",
+"mr.find_one()[\"data\"][\"document\"].keys()"
 ]
 },
 {
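
Most of these cells follow the same flow: pull documents out of the JSON-API style response, then work with them in pandas. A rough sketch of that flow, reusing the db/coll Astra DB collection objects the notebook creates earlier (their construction is not part of this diff):

import pandas as pd

# coll is assumed to be the Astra DB collection set up earlier in the notebook.
# find() returns a JSON-API style response; documents sit under ["data"]["documents"].
table = coll.find()["data"]["documents"]
df = pd.DataFrame(table)  # list of dicts -> DataFrame
print(df.head())

# paginated_find() iterates past the single-response page limit; the notebook uses it
# both to inspect result keys and to count documents.
count = 0
for r in coll.paginated_find():
    count += 1
print(count)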

src/vdf_io/notebooks/chroma-qs.ipynb

Lines changed: 21 additions & 18 deletions
@@ -191,7 +191,7 @@
 },
 "outputs": [],
 "source": [
-"import chromadb\n"
+"import chromadb"
 ]
 },
 {
@@ -200,9 +200,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"\n",
 "# setup Chroma in-memory, for easy prototyping. Can add persistence easily!\n",
-"client = chromadb.PersistentClient()\n"
+"client = chromadb.PersistentClient()"
 ]
 },
 {
@@ -247,21 +246,23 @@
 }
 ],
 "source": [
-"\n",
 "# Create collection. get_collection, get_or_create_collection, delete_collection also available!\n",
 "collection2 = client.get_or_create_collection(\"test\")\n",
 "\n",
 "# Add docs to the collection. Can also update and delete. Row-based API coming soon!\n",
 "collection2.add(\n",
-"    documents=[\"This is document1\", \"This is document2\"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well\n",
-"    metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}], # filter on these!\n",
-"    ids=[\"doc1\", \"doc2\"], # unique for each doc\n",
-"    embeddings=[[1,2,3], [4,5,6]] # optional, we can also embed for you\n",
+"    documents=[\n",
+"        \"This is document1\",\n",
+"        \"This is document2\",\n",
+"    ],  # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well\n",
+"    metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}],  # filter on these!\n",
+"    ids=[\"doc1\", \"doc2\"],  # unique for each doc\n",
+"    embeddings=[[1, 2, 3], [4, 5, 6]],  # optional, we can also embed for you\n",
 ")\n",
 "\n",
 "# Query/search 2 most similar results. You can also .get by id\n",
 "results = collection2.query(\n",
-"    query_embeddings=[[1,2,3]],\n",
+"    query_embeddings=[[1, 2, 3]],\n",
 "    n_results=2,\n",
 "    # where={\"metadata_field\": \"is_equal_to_this\"}, # optional filter\n",
 "    # where_document={\"$contains\":\"search_string\"} # optional filter\n",
@@ -304,7 +305,7 @@
 },
 "outputs": [],
 "source": [
-"coll=client.get_collection(\"test\")"
+"coll = client.get_collection(\"test\")"
 ]
 },
 {
@@ -335,7 +336,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"client.delete_collection(\"test\") # delete collection"
+"client.delete_collection(\"test\")  # delete collection"
 ]
 },
 {
@@ -357,7 +358,7 @@
 }
 ],
 "source": [
-"client.list_collections() # list collections"
+"client.list_collections()  # list collections"
 ]
 },
 {
@@ -399,7 +400,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"pclient = chromadb.PersistentClient(\"~/.chroma4\") # persistent client\n",
+"pclient = chromadb.PersistentClient(\"~/.chroma4\")  # persistent client\n",
 "\n",
 "pcol = pclient.create_collection(\"test3\")"
 ]
@@ -414,8 +415,8 @@
 "    documents=[\"This is document1\", \"This is document2\"],\n",
 "    metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}],\n",
 "    ids=[\"doc1\", \"doc2\"],\n",
-"    embeddings=[[1,2,3], [4,5,6]]\n",
-")\n"
+"    embeddings=[[1, 2, 3], [4, 5, 6]],\n",
+")"
 ]
 },
 {
@@ -479,7 +480,9 @@
 },
 "outputs": [],
 "source": [
-"client2= chromadb.PersistentClient(\"/Users/dhruvanand/Code/vector-io/src/vdf_io/notebooks/chroma\")\n"
+"client2 = chromadb.PersistentClient(\n",
+"    \"/Users/dhruvanand/Code/vector-io/src/vdf_io/notebooks/chroma\"\n",
+")"
 ]
 },
 {
@@ -559,7 +562,7 @@
 },
 "outputs": [],
 "source": [
-"coll2=client2.get_collection(\"vdf_2024_9-11\")"
+"coll2 = client2.get_collection(\"vdf_2024_9-11\")"
 ]
 },
 {
@@ -653,7 +656,7 @@
 "    documents=[\"This is document1\", \"This is document2\"],\n",
 "    metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}],\n",
 "    ids=[\"doc5\", \"doc6\"],\n",
-"    embeddings=[[1,2,3], None]\n",
+"    embeddings=[[1, 2, 3], None],\n",
 ")"
 ]
 },
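
Taken together, these cells exercise a small slice of the Chroma client API. A self-contained sketch of those calls, assuming chromadb is installed (collection and document names are the notebook's own examples):

import chromadb

client = chromadb.PersistentClient()  # persists to ./chroma by default
collection = client.get_or_create_collection("test")

collection.add(
    documents=["This is document1", "This is document2"],
    metadatas=[{"source": "notion"}, {"source": "google-docs"}],  # filterable metadata
    ids=["doc1", "doc2"],  # unique per document
    embeddings=[[1, 2, 3], [4, 5, 6]],  # optional; Chroma can also embed for you
)

# Query the 2 most similar results by embedding.
results = collection.query(query_embeddings=[[1, 2, 3]], n_results=2)
print(results["ids"])

print(client.list_collections())  # list collections
client.delete_collection("test")  # delete collection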

src/vdf_io/notebooks/deeplake.ipynb

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@
 "source": [
 "import deeplake\n",
 "\n",
-"ds = deeplake.load('hub://activeloop/coco-train')"
+"ds = deeplake.load(\"hub://activeloop/coco-train\")"
 ]
 },
 {

src/vdf_io/notebooks/json_pandas.ipynb

Lines changed: 6 additions & 14 deletions
@@ -17,34 +17,26 @@
 "source": [
 "import json\n",
 "import pandas as pd\n",
-"js=[\n",
+"\n",
+"js = [\n",
 "    {\n",
 "        \"id\": 1,\n",
 "        \"name\": \"John Doe\",\n",
 "        \"age\": 30,\n",
-"        \"attribute\": {\n",
-"            \"height\": 176,\n",
-"            \"weight\": 80\n",
-"        }\n",
+"        \"attribute\": {\"height\": 176, \"weight\": 80},\n",
 "    },\n",
 "    {\n",
 "        \"id\": 2,\n",
 "        \"name\": \"Alice Smith\",\n",
 "        \"age\": 28,\n",
-"        \"attribute\": {\n",
-"            \"height\": 167,\n",
-"            \"weight\": 55\n",
-"        }\n",
+"        \"attribute\": {\"height\": 167, \"weight\": 55},\n",
 "    },\n",
 "    {\n",
 "        \"id\": 3,\n",
 "        \"name\": \"Bob Johnson\",\n",
 "        \"age\": 35,\n",
-"        \"attribute\": {\n",
-"            \"height\": 192,\n",
-"            \"weight\": 85\n",
-"        }\n",
-"    }\n",
+"        \"attribute\": {\"height\": 192, \"weight\": 85},\n",
+"    },\n",
 "]\n",
 "\n",
 "df = pd.read_json(json.dumps(js))"

src/vdf_io/notebooks/jsonl_to_parquet.ipynb

Lines changed: 9 additions & 9 deletions
@@ -15,9 +15,7 @@
 "outputs": [],
 "source": [
 "# Importing required libraries\n",
-"import pandas as pd\n",
-"import pyarrow as pa\n",
-"import pyarrow.parquet as pq"
+"import pandas as pd"
 ]
 },
 {
@@ -35,7 +33,7 @@
 "outputs": [],
 "source": [
 "# Load JSONL File\n",
-"jsonl_file = '/Users/dhruvanand/Code/datasets-dumps/shard-00000.jsonl 2'\n"
+"jsonl_file = \"/Users/dhruvanand/Code/datasets-dumps/shard-00000.jsonl 2\""
 ]
 },
 {
@@ -53,7 +51,9 @@
 "outputs": [],
 "source": [
 "# Convert JSONL to DataFrame\n",
-"df = pd.read_json(jsonl_file, lines=True) # Convert the loaded jsonl data into a pandas DataFrame"
+"df = pd.read_json(\n",
+"    jsonl_file, lines=True\n",
+")  # Convert the loaded jsonl data into a pandas DataFrame"
 ]
 },
 {
@@ -172,7 +172,7 @@
 }
 ],
 "source": [
-"df['metadata'].iloc[3]"
+"df[\"metadata\"].iloc[3]"
 ]
 },
 {
@@ -213,9 +213,9 @@
 "outputs": [],
 "source": [
 "# Save DataFrame as Parquet\n",
-"for comp in ['snappy', 'gzip', 'brotli', 'zstd']:\n",
-"    parquet_file = f'path_to_output_file-{comp}.parquet' # replace with your desired output parquet file path\n",
-"    df.to_parquet(parquet_file, engine='pyarrow', compression=comp)"
+"for comp in [\"snappy\", \"gzip\", \"brotli\", \"zstd\"]:\n",
+"    parquet_file = f\"path_to_output_file-{comp}.parquet\"  # replace with your desired output parquet file path\n",
+"    df.to_parquet(parquet_file, engine=\"pyarrow\", compression=comp)"
 ]
 },
 {
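
The whole notebook reduces to: read a JSONL file (one JSON object per line) into a DataFrame, then write it out once per compression codec. A sketch with placeholder paths and an added size comparison; brotli and zstd need their codecs installed alongside pyarrow:

import os

import pandas as pd

jsonl_file = "shard-00000.jsonl"  # placeholder path
df = pd.read_json(jsonl_file, lines=True)  # one JSON object per line

for comp in ["snappy", "gzip", "brotli", "zstd"]:
    parquet_file = f"output-{comp}.parquet"  # placeholder output path
    df.to_parquet(parquet_file, engine="pyarrow", compression=comp)
    print(comp, os.path.getsize(parquet_file), "bytes")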

src/vdf_io/notebooks/jsonltgz_to_parquet.ipynb

Lines changed: 3 additions & 5 deletions
@@ -17,9 +17,7 @@
 "outputs": [],
 "source": [
 "# Importing required libraries\n",
-"import pandas as pd\n",
-"import pyarrow as pa\n",
-"import pyarrow.parquet as pq"
+"import pandas as pd"
 ]
 },
 {
@@ -71,7 +69,7 @@
 "with open(jsonl_file) as f:\n",
 "    for i, l in enumerate(f):\n",
 "        pass\n",
-"    print(i)\n"
+"    print(i)"
 ]
 },
 {
@@ -120,7 +118,7 @@
 }
 ],
 "source": [
-"df.iloc[0]['url']"
+"df.iloc[0][\"url\"]"
 ]
 },
 {
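
The line-count cell above enumerates the open file handle directly. A sketch of that idiom with a placeholder path; note that enumerate starts at 0, so the total number of lines is i + 1:

jsonl_file = "shard-00000.jsonl"  # placeholder path to an already-extracted .jsonl file

i = -1  # guard in case the file is empty
with open(jsonl_file) as f:
    for i, line in enumerate(f):
        pass
print(i + 1)  # enumerate is 0-based, so i + 1 is the line count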
