
Commit 53f02b3

Merge branch 'main' into cleanup

Fix scheduler test

2 parents: 1fd2286 + 847a5d4

File tree: 9 files changed, +97 −40 lines

.gitignore
Lines changed: 1 addition & 0 deletions

@@ -23,4 +23,5 @@ test_data/
 **/*.tbl
 
 executor_logs/*
+
 job_summary.json

docs/architecture.png
Binary file changed (207 KB, −64.1 KB); contents not shown.

docs/design_doc.md
Lines changed: 10 additions & 8 deletions

@@ -14,14 +14,12 @@
 
 **75% Goals:**
 - Able to break down a physical plan into a distributed QUERY plan.
-- Achieve both inter-QUERY and intra-QUERY parallelism.
-- Provide job status.
 - End-to-end correctness/performance testing framework.
+- Provide job status.
 
 **100% Goals:**
-- Implement data shuffling between QUERY stages.
+- Achieve both inter-QUERY and intra-QUERY parallelism.
 - Cost-based and dynamic priority scheduling for better fairness.
-- Data-locality optimizations.
 - Able to abort/cancel a QUERY.
 
 **125% Goals:**
@@ -30,16 +28,18 @@
 
 # Architectural Design
 
-![Project Proposal Architecture](project_proposal_arch.png "Project Proposal Architecture Diagram")
+![Project Proposal Architecture](architecture.png "Project Proposal Architecture Diagram")
+
 
 **Architectural Components:**
 - **DAG Parser:** Parses a Datafusion ExecutionPlan into a DAG of stages, where each stage consists of tasks that can be completed without shuffling intermediate results. After decomposing the work, it then enqueues tasks into a work queue in a breadth-first manner.
-- **Work Queue:** A concurrent queue (initially FIFO) where tasks are enqueued by the DAG Parser. Each QUERY submitted by the optimizer also has a cost, allowing for heuristic adjustments to the ordering.
+- **Work Queue:** A concurrent queue where tasks are enqueued by the DAG Parser. Each QUERY submitted by the optimizer also has a cost, allowing for heuristic adjustments to the ordering.
 - **Work Threads (tokio):** Tokio threads are created for each executor node to handle communications.
 - **QueryID Table:** An in-memory data structure mapping QueryIDs to a DAG of remaining QUERY fragments and cost estimates retrieved from the optimizer.
 - **Executors:** Each executor is connected to the scheduler and the other executors via gRPC (tonic).
-- **Intermediate Results**: Intermediate results are stored as a thread-safe HashMap<TaskKey, Vec<RecordBatch> in shared memory. All executors will be able to access intermediate results without having to serialize/deserialize data.
+- **Intermediate Results**: Intermediate results are stored as a thread-safe hashmap in shared memory. All executors will be able to access intermediate results without having to serialize/deserialize data.
 
+![Task Dispatch Loop](task_dispatch_loop.png "Task Dispatch Loop")
 **Workflow:**
 1. Receives Datafusion ExecutionPlans from Query Optimizer and parses them into DAG, then stores in QueryID Table.
 2. Leaves of DAG are added to work queue that work threads can pull from.
@@ -62,6 +62,8 @@ Individual components within the scheduler will be unit tested using Rust’s te
 
 The end-to-end testing framework is composed of three primary components: the mock frontend, the mock catalog, and the mock executors.
 
+
+![E2E Testing Architecture](e2e-testing-arch.png)
 ### 1. Frontend
 The `MockFrontend` class is responsible for:
 - Establishing and maintaining a connection with the scheduler.
@@ -106,12 +108,12 @@ These consist of DataFusion executors and gRPC clients that execute tasks, recei
 ### Performance Benchmarking
 To assess the scheduler's capacity to handle complex OLAP queries and maintain high throughput, we intend to use the integration test framework to simultaneously submit all 22 TPC-H queries across a cluster of executors. We will collect the following metrics:
 
+- **Speedup from Scaling Out Executors**: Measure the speedup gained from increasing the number of executors.
 - **Execution Time for Each Query**: Measure the duration from submission to completion for each query.
 - **Busy-Idle Time Ratio for Each Executor**: Record periods of activity and inactivity for each executor throughout the test.
 
 Additionally, we plan to develop data visualization tools in Python to present the results more effectively.
 
-![E2E Testing Architecture](e2e_testing_arch.png)
 
 ## Future Composability with Other Components
 The mock optimizer and executor can be directly replaced with alternative implementations without necessitating any additional changes to the system. While the catalog, cache, and storage layers are not directly integrated into the testing system, we plan to encapsulate most of the logic within the mock catalog to simplify future integration.
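The **Intermediate Results** component described in this design doc amounts to a process-wide concurrent map from a task's identity to its Arrow record batches, shared by every executor in the same process. Below is a minimal sketch of such a table, assuming `dashmap` and `once_cell` as dependencies and a `(query_id, stage_id)` key; the names mirror `src/intermediate_results.rs` (whose `get_results` is async and awaited in `src/frontend.rs`), but the signatures and return types here are illustrative, not the crate's actual API:

```rust
// Hypothetical sketch of a shared intermediate-results table; the real
// `src/intermediate_results.rs` module may differ in shape and signatures.
use dashmap::DashMap;
use datafusion::arrow::record_batch::RecordBatch;
use once_cell::sync::Lazy;

#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub struct TaskKey {
    pub query_id: u64,
    pub stage_id: u64,
}

// One table shared by all executor threads in this process, so stage
// outputs never have to be serialized/deserialized between tasks.
static INTERMEDIATE_RESULTS: Lazy<DashMap<TaskKey, Vec<RecordBatch>>> =
    Lazy::new(DashMap::new);

/// Record the output batches of a finished stage.
pub fn insert_results(key: TaskKey, batches: Vec<RecordBatch>) {
    INTERMEDIATE_RESULTS.insert(key, batches);
}

/// Fetch (a clone of) the batches produced for `key`, if any exist yet.
pub fn get_results(key: &TaskKey) -> Option<Vec<RecordBatch>> {
    INTERMEDIATE_RESULTS.get(key).map(|entry| entry.value().clone())
}
```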

docs/task_dispatch_loop.png
Binary file (226 KB); contents not shown.

src/bin/server.rs
Lines changed: 1 addition & 2 deletions

@@ -1,10 +1,9 @@
-use tonic::transport::Server;
 use scheduler2::composable_database::scheduler_api_server::SchedulerApiServer;
 use scheduler2::server::SchedulerService;
+use tonic::transport::Server;
 
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
-
     let addr = "0.0.0.0:15721".parse().unwrap();
 
     let catalog_path = concat!(env!("CARGO_MANIFEST_DIR"), "/test_files/");

src/frontend.rs
Lines changed: 9 additions & 4 deletions

@@ -48,6 +48,7 @@ use crate::composable_database::scheduler_api_client::SchedulerApiClient;
 use crate::composable_database::QueryJobStatusArgs;
 use crate::composable_database::QueryStatus;
 use crate::composable_database::QueryStatus::InProgress;
+use crate::intermediate_results::{get_results, TaskKey};
 use crate::mock_catalog::load_catalog;
 use crate::mock_optimizer::Optimizer;
 use crate::parser::ExecutionPlanParser;
@@ -56,7 +57,6 @@ use datafusion::logical_expr::LogicalPlan;
 use datafusion::prelude::SessionContext;
 use serde::{Deserialize, Serialize};
 use tonic::Request;
-use crate::intermediate_results::{get_results, TaskKey};
 
 #[derive(Clone, Serialize, Deserialize)]
 pub struct JobInfo {
@@ -329,10 +329,15 @@ impl MockFrontend {
         // }
         // };
 
-        let results = get_results(&TaskKey{stage_id: status.stage_id, query_id: status.query_id}).await
-            .expect("api.rs: query is done but no results in table");
+        let results = get_results(&TaskKey {
+            stage_id: status.stage_id,
+            query_id: status.query_id,
+        })
+        .await
+        .expect("api.rs: query is done but no results in table");
 
-        let flattened_results: Vec<RecordBatch> = results.into_iter().flat_map(|r| r.into_iter()).collect();
+        let flattened_results: Vec<RecordBatch> =
+            results.into_iter().flat_map(|r| r.into_iter()).collect();
 
         let updated_job_info = JobInfo {
             query_id,
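For context on the reformatted lookup above: once a query finishes, the frontend fetches its intermediate results and flattens the nested batches into a single `Vec<RecordBatch>`. A tiny illustration, assuming `get_results` returns one `Vec<RecordBatch>` per task or partition (the exact return type lives in `src/intermediate_results.rs`):

```rust
use datafusion::arrow::record_batch::RecordBatch;

// Illustrative only: collapse per-task result vectors into one flat list,
// mirroring the `flat_map` in the frontend code above.
fn flatten_results(per_task: Vec<Vec<RecordBatch>>) -> Vec<RecordBatch> {
    per_task.into_iter().flat_map(|r| r.into_iter()).collect()
}
```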

src/queue.rs
Lines changed: 59 additions & 6 deletions

@@ -4,11 +4,9 @@ use crate::task::{
     Task,
     TaskStatus::{self, *},
 };
-use crate::SchedulerError;
 use dashmap::DashMap;
 use datafusion::physical_plan::ExecutionPlan;
-use datafusion_proto::bytes::physical_plan_to_bytes;
-use std::collections::{BTreeMap, BTreeSet, HashMap};
+use std::collections::BTreeMap;
 use std::hash::{Hash, Hasher};
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::Arc;
@@ -150,10 +148,10 @@ impl State {
 #[cfg(test)]
 mod tests {
     use rand::Rng;
-    use std::fs;
-    use tokio::sync::{Mutex, Notify};
+    use std::{fs, time::{Duration, SystemTime}};
+    use tokio::{sync::{Mutex, Notify}, time::sleep};
 
-    use crate::parser::ExecutionPlanParser;
+    use crate::{parser::ExecutionPlanParser, query_graph::QueryGraph};
     use crate::queue::State;
     use crate::task::TaskStatus;
     use std::{cmp::min, sync::Arc};
@@ -250,4 +248,59 @@ mod tests {
         assert!(queue.size().await == 0);
         println!("Finished {:?} tasks.", nplans);
     }
+
+    // Test correctness of stride scheduling algorithm.
+    #[tokio::test]
+    async fn test_stride() {
+        let test_sql_dir = concat!(env!("CARGO_MANIFEST_DIR"), "/test_sql/");
+        let catalog_path = concat!(env!("CARGO_MANIFEST_DIR"), "/test_data/");
+        let queue = Box::new(State::new(Arc::new(Notify::new())));
+        let parser = ExecutionPlanParser::new(catalog_path).await;
+        println!("test_queue_conc: Testing files in {}", test_sql_dir);
+
+        // Generate list of all tpch plans
+        let mut physical_plans = Vec::new();
+        for file in fs::read_dir(test_sql_dir).unwrap() {
+            let path_buf = file.unwrap().path();
+            let path = path_buf.to_str().unwrap();
+            physical_plans.extend(parser.get_execution_plan_from_file(&path).await.unwrap());
+        }
+
+        println!("Got {} plans.", physical_plans.len());
+        let mut long_plans = Vec::new();
+
+        // Generate list of plans with at least `rounds` stages
+        let rounds = 5;
+        for plan in physical_plans {
+            let graph = QueryGraph::new(0, Arc::clone(&plan));
+            if graph.stages.len() >= rounds {
+                long_plans.push(plan);
+            }
+        }
+        let nplans = long_plans.len();
+
+        // Add a bunch of queries with staggered submission time
+        let start_enqueue = SystemTime::now();
+        for plan in long_plans {
+            queue
+                .add_query(Arc::clone(&plan))
+                .await;
+            sleep(Duration::from_millis(10)).await;
+        }
+        let enq_time = SystemTime::now().duration_since(start_enqueue).unwrap();
+
+        // Ensure correct order of queue
+        for rnd in 0..rounds {
+            for i in 0..nplans {
+                let (task, _) = queue.next_task().await.unwrap();
+                // Queries should be processed in order
+                assert_eq!(task.query_id, i as u64);
+                // "process" for at least as long as (max_pass - min_pass)
+                sleep(enq_time).await;
+                // Return task; update query's pass
+                queue.report_task(task, TaskStatus::Finished).await;
+                println!("(Round {}) Query {}/{} ok.", rnd + 1, i + 1, nplans);
+            }
+        }
+    }
 }
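The new `test_stride` test exercises stride scheduling: queries are admitted with staggered submission times, tasks must come back in query order on every round, and reporting a finished task advances that query's pass. A rough sketch of the bookkeeping this implies; `StrideQueue`, `BASE_STRIDE`, and the priority handling are illustrative assumptions rather than the actual fields of `queue::State`:

```rust
use std::collections::BTreeMap;

// Illustrative constant: a query's pass increment is BASE_STRIDE / priority.
const BASE_STRIDE: u64 = 1 << 20;

struct QueryEntry {
    stride: u64, // higher priority => smaller stride => scheduled more often
}

#[derive(Default)]
struct StrideQueue {
    // Keyed by (pass, query_id); the entry with the smallest pass runs next.
    by_pass: BTreeMap<(u64, u64), QueryEntry>,
}

impl StrideQueue {
    /// Admit a query at the current minimum pass so it can neither starve
    /// nor monopolize the queue.
    fn add_query(&mut self, query_id: u64, priority: u64) {
        let min_pass = self.by_pass.keys().next().map(|&(p, _)| p).unwrap_or(0);
        let entry = QueryEntry { stride: BASE_STRIDE / priority.max(1) };
        self.by_pass.insert((min_pass, query_id), entry);
    }

    /// Hand out one task for the lowest-pass query, then advance its pass by
    /// its stride and reinsert it (the update `report_task` would perform).
    fn next_query(&mut self) -> Option<u64> {
        let (&(pass, query_id), _) = self.by_pass.iter().next()?;
        let entry = self.by_pass.remove(&(pass, query_id)).unwrap();
        self.by_pass.insert((pass + entry.stride, query_id), entry);
        Some(query_id)
    }
}
```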

src/server.rs
Lines changed: 17 additions & 20 deletions

@@ -78,7 +78,6 @@ impl SchedulerApi for SchedulerService {
 
         let plan = physical_plan_from_bytes(bytes.as_slice(), &self.ctx)
             .expect("Failed to deserialize physical plan");
-
         let qid = self.state.add_query(plan).await;
 
         let response = ScheduleQueryRet { query_id: qid };
@@ -181,8 +180,8 @@ mod tests {
 
     #[tokio::test]
     async fn test_scheduler() {
-        let test_file = concat!(env!("CARGO_MANIFEST_DIR"), "/test_files/expr.slt");
-        let catalog_path = concat!(env!("CARGO_MANIFEST_DIR"), "/test_files/");
+        let test_file = concat!(env!("CARGO_MANIFEST_DIR"), "/test_sql/1.sql");
+        let catalog_path = concat!(env!("CARGO_MANIFEST_DIR"), "/test_data/");
         let scheduler_service = Box::new(SchedulerService::new(catalog_path).await);
         let parser = ExecutionPlanParser::new(catalog_path).await;
         println!("test_scheduler: Testing file {}", test_file);
@@ -227,29 +226,27 @@ mod tests {
                 test_file
             );
         }
-        // println!(
-        //     "test_scheduler: queued {} tasks.",
-        //     scheduler_service.queue.lock().await.size()
-        // );
+        println!(
+            "test_scheduler: queued {} tasks.",
+            scheduler_service.state.size().await
+        );
 
         // TODO: add concurrent test eventually
         let mut send_task = NotifyTaskStateArgs {
             task: None,
             success: true,
         };
         // may not terminate
-        loop {
-            let ret = scheduler_service
-                .notify_task_state(Request::new(send_task.clone()))
-                .await
-                .unwrap();
-            let NotifyTaskStateRet {
-                has_new_task,
-                task,
-                physical_plan,
-            } = ret.into_inner();
-            assert!(task.is_some());
-            send_task.task = task;
-        }
+        let ret = scheduler_service
+            .notify_task_state(Request::new(send_task.clone()))
+            .await
+            .unwrap();
+        let NotifyTaskStateRet {
+            has_new_task,
+            task,
+            physical_plan,
+        } = ret.into_inner();
+        println!("test_scheduler: Received task {:?}", task.unwrap());
+        send_task.task = task;
     }
 }
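The simplified test above drives a single `notify_task_state` round trip; in a running system an executor would keep calling it in a loop, reporting each finished task and picking up the next one. A hedged sketch of that executor-side loop, using only the message shapes visible in this diff (`NotifyTaskStateArgs { task, success }`, `NotifyTaskStateRet { has_new_task, task, physical_plan }`); the connection address, module paths, and error handling are assumptions, and the repository's real executor code may differ:

```rust
// Hypothetical executor poll loop; not the repository's executor implementation.
use datafusion::prelude::SessionContext;
use datafusion_proto::bytes::physical_plan_from_bytes;
use scheduler2::composable_database::scheduler_api_client::SchedulerApiClient;
use scheduler2::composable_database::NotifyTaskStateArgs;
use tonic::Request;

async fn executor_poll_loop() -> Result<(), Box<dyn std::error::Error>> {
    // Address assumed to match the bind address in src/bin/server.rs.
    let mut client = SchedulerApiClient::connect("http://0.0.0.0:15721").await?;
    let ctx = SessionContext::new();

    // First call reports no finished task; later calls report the task we
    // just completed and ask for a new one.
    let mut args = NotifyTaskStateArgs { task: None, success: true };

    loop {
        let reply = client
            .notify_task_state(Request::new(args.clone()))
            .await?
            .into_inner();

        if !reply.has_new_task {
            break; // nothing left to run
        }

        // Deserialize the plan; a real executor would run it here and write
        // its output into the shared intermediate-results table.
        let _plan = physical_plan_from_bytes(reply.physical_plan.as_slice(), &ctx)?;
        args = NotifyTaskStateArgs { task: reply.task, success: true };
    }
    Ok(())
}
```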
