diff --git a/.gitignore b/.gitignore index 1a8a8a0..3979a0f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ .DS_Store Cargo.lock /p3 +/visualizations +/visualizations_python/__pycache__ +/visualizations_python/parameters_for_histogram.yaml \ No newline at end of file diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 03c844d..ba9e8db 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,6 +1,6 @@ # Architecture -This document describes the highest level architecture of the library iterative_methods_done_right_in_rust. As a tree structure, the code base looks like this: +The high level architecture of the iterative_methods crate. As a tree structure, the code base looks like this: ``` iterative_methods_done_right_in_rust @@ -21,24 +21,30 @@ If you haven't already, please see the README.md document for an introduction to The remaining sections will address the contents and structure of each directory of the tree. -## src - -In `src` the lib.rs file contains all of the code needed to use the tools in this library. Unit tests are included. Most of the code base consists of iterator adaptors. They currently include -- ReservoirIterable -- StepBy -- Tee -- TimedIterable -and their associated functions and structs. +## src +In `src`, `lib.rs` exports iterative methods (via algorithms.rs) and +utilities consisting of iterator adaptors. They currently include +- take_until +- assess +- inspect +- last +- time +- step_by +- write_yaml_documents +- enumerate +- {weighted_}reservoir_sample +and their associated implementations. Unit tests are included. ## examples -Examples demonstrating the basic functionality are provided. Currently, the examples include -- Fibonnacci -- Conjugate Gradient Method (CGIterable) +Examples demonstrating different functionality. Currently: + +- Conjugate Gradient Method (ConjugateGradient) - Weighted Reservoir Sampling +- Output to yaml, e.g., for animations ## tests -Integration tests are provided. +Some integration tests. 
diff --git a/COPYING.md b/COPYING.md deleted file mode 100644 index 29dc35f..0000000 --- a/COPYING.md +++ /dev/null @@ -1,3 +0,0 @@ -This project is dual-licensed under the Apache 2.0 and MIT licenses. - -You may use this code under the terms of either license. diff --git a/Cargo.toml b/Cargo.toml index fba9c87..9e14f14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,10 @@ edition = "2018" description = "Iterative methods and associated utilities as StreamingIterators." license = "MIT OR Apache-2.0" keywords = ["math","stream","machine-learning","algorithm","optimization"] +repository = "https://github.com/daniel-vainsencher/iterative_methods_rs" +readme = "README.md" +exclude = ["/local", "/visualizations", "/p3", "/visualizations_python/__pycache__", "visualizations_python/parameters_for_histogram.yaml"] + [lib] name = "iterative_methods" path = "src/lib.rs" diff --git a/Plan.org b/Plan.org deleted file mode 100644 index 347c16c..0000000 --- a/Plan.org +++ /dev/null @@ -1,35 +0,0 @@ -- [X] 0.1 IM parity - - [X] timing adaptor -- [ ] 0.3 - - Generic "compute cost function" adaptor - - Generic "report time and cost progress" adaptor - - Explain power method as first example - - Part 1: show a single iteration, explain how matrix multiplication works - - Part 2: show animation of point going to infinity, or rotation, we want to understand what matrices do, specialize to diagonal+p.s.d. - - Part 3: implement power method the usual way, gets awkward, switch to adaptors, create animation. -- [ ] 0.5: publishable post - - [ ] Code - - [ ] Functionality going beyond IM? - - candidates: - - structured logging and analysis via serde - - expanded sampling (exponential from start and end, weighted reservoir) - - expand trait: - - cost function? - - solution extraction function? - - adaptor for quantifying convergence rates? a shim around that for testing performance? - - support for frame based profiling? 
- - [ ] Can we add impl-style adaptors from outside the streaming-iterator crate? - - [ ] Decide format and focus - - [ ] Side post walking through the process of exploring an algorithm using the adaptors? - - [ ] Fill out content -- [ ] 0.8 publish post, get feedback. - - [ ] Clean up repo - - [ ] Publish. reddit? ndarray/ml hangouts? - - [ ] Apply feedback -- [ ] 1.0 decision point - - [ ] Decide whether this is a single post repo or we expand on the project. - - [ ] Smart timing adaptor: get desired time interval, time some - iterations, as long as time is < interval/2,double number of - iterations being timed. Report time per iterations. Definitely defer to future blog post - - diff --git a/README.md b/README.md index 5df767a..f3d68ab 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,26 @@ -This project aims to implement iterative methods and associated utilities in Rust. +The iterative_methods project implements iterative methods and +associated utilities in Rust. It currently demonstrates the following techniques we find powerful: - Implement iterative methods as StreamingIterators. -- Implement utilities useful to iterative methods as generic adaptors +- Implement utilities for iterative methods as generic adaptors of StreamingIterators. - Test non-trivial methods via property testing (quickcheck). - Generic output via streaming yaml +If you're not familiar with iterative methods or what the above mean, +[start +here](https://daniel-vainsencher.github.io/book/iterative_methods_part_1.html). + Future plans: - Expand/stabilize design -- Add new iterative methods +- Add more iterative methods - Add higher level utilities - Add simple function call interface to methods. Stability/evolution: - The design is actively evolving, breakage is to be expected - everywhere. + everywhere. Feedback welcome! Email us or open issues on the repo. - Some utilities (e.g., take_until) probably belong elsewhere (e.g., {Streaming}Iterator) and so might migrate entirely. 
diff --git a/examples/reservoir_histogram_animation.rs b/examples/reservoir_histogram_animation.rs index 99b1807..a1fc291 100644 --- a/examples/reservoir_histogram_animation.rs +++ b/examples/reservoir_histogram_animation.rs @@ -71,7 +71,7 @@ fn write_reservoir_visualizations_data_to_yaml( let stream = enumerate(stream); let stream = write_yaml_documents(stream, parameters["stream_file"].to_string()) .expect("Create File and initialize yaml iter failed."); - let stream = reservoir_iterable(stream, capacity, None); + let stream = reservoir_sample(stream, capacity, None); let stream = write_yaml_documents(stream, parameters["reservoir_samples_file"].to_string()) .expect("Create File and initialize yaml iter failed."); diff --git a/examples/reservoir_sampling_mean_convergence.rs b/examples/reservoir_sampling_mean_convergence.rs index ae5d26f..61c3e48 100644 --- a/examples/reservoir_sampling_mean_convergence.rs +++ b/examples/reservoir_sampling_mean_convergence.rs @@ -31,7 +31,7 @@ fn reservoir_sampling_mean_convergence() -> std::io::Result<()> { // Create another copy of the stream to perform reservoir sampling and write to yaml: let stream = utils::generate_enumerated_step_stream(stream_size, num_of_initial_values, 0, 1); - let res_iter = reservoir_iterable(stream, capacity, None); + let res_iter = reservoir_sample(stream, capacity, None); let reservoir_samples_file = "./target/debug/examples/reservoirs.yaml"; // Write data to file for visualization. let mut res_to_yaml = write_yaml_documents(res_iter, reservoir_samples_file.to_string()) diff --git a/examples/weighted_reservoir_sampling.rs b/examples/weighted_reservoir_sampling.rs index a37f9db..ec56f37 100644 --- a/examples/weighted_reservoir_sampling.rs +++ b/examples/weighted_reservoir_sampling.rs @@ -5,7 +5,7 @@ use streaming_iterator::*; /// Utility function to generate a sequence of (float, int as float) /// values wrapped in a WeightedDatum struct that will be used in tests -/// of ReservoirIterable. 
+/// of ReservoirSample. fn generate_seeded_values(num_values: usize, int_range_bound: usize) -> Vec> { let mut prng = Pcg64::seed_from_u64(1); let mut seeded_values: Vec> = Vec::new(); @@ -32,7 +32,7 @@ fn wrs_demo() { println!("Random Numbers for Alg: \n (The values are used as the probabilities and the weights as indices.) \n {:#?} \n ", probability_and_index); let stream = convert(stream); - let mut stream = weighted_reservoir_iterable(stream, 2, Some(Pcg64::seed_from_u64(1))); + let mut stream = weighted_reservoir_sample(stream, 2, Some(Pcg64::seed_from_u64(1))); println!("Reservoir - initially empty: \n {:#?} \n", stream.reservoir); let mut _index = 0usize; while let Some(reservoir) = stream.next() { diff --git a/examples/wrs_histogram_animation.rs b/examples/wrs_histogram_animation.rs index 1983b07..9a30d14 100644 --- a/examples/wrs_histogram_animation.rs +++ b/examples/wrs_histogram_animation.rs @@ -82,7 +82,7 @@ fn write_wrs_visualizations_data_to_yaml( .expect("Create File and initialize yaml iter failed."); // Add constant weights to all items let stream = wd_iterable(stream, |_x| 1.0f64); - let stream = weighted_reservoir_iterable(stream, capacity, None); + let stream = weighted_reservoir_sample(stream, capacity, None); // remove the weights, which were only needed for applying WRS. 
let stream = stream.map(|x| { let x: Vec> = x.iter().map(|wd| wd.value.clone()).collect(); @@ -172,7 +172,14 @@ fn main() -> std::io::Result<()> { let (visualize, stream_size, capacity) = set_visualization_parameters(); remove_yaml_files()?; write_wrs_visualizations_data_to_yaml(stream_size, capacity)?; - println!("Data is written to yaml files."); + println!( + "Animation data files saved at: + ./target/debug/examples/population_for_histogram.yaml + ./target/debug/examples/reservoirs_for_histogram.yaml + ./target/debug/examples/reservoir_means.yaml + ./target/debug/examples/stream_for_histogram.yaml + " + ); if visualize { make_visualization_in_python( "./visualizations_python/reservoir_histograms_initial_final.py", @@ -186,15 +193,7 @@ fn main() -> std::io::Result<()> { "./visualizations_python/reservoir_histogram_animation.py", "Animation of reservoir and stream histograms", )?; - } else { - println!( - "The following .yaml files have been created:\n - ./target/debug/examples/population_for_histogram.yaml \n - ./target/debug/examples/reservoirs_for_histogram.yaml \n - ./target/debug/examples/reservoir_means.yaml \n - ./target/debug/examples/stream_for_histogram.yaml \n - " - ); + println!("Animations saved at visualizations/*.html") } Ok(()) } diff --git a/pull_request_template.md b/pull_request_template.md index ad4fdfc..ef43f9f 100644 --- a/pull_request_template.md +++ b/pull_request_template.md @@ -1,17 +1,17 @@ # Intent: -- [ ] Explain here what the goal is, so reviewer can read the implementation and judge whether that goal is achieved. + # Validation: -- [ ] Are changes covered by tests so we know existing functionality is not broken? -- [ ] Is the new functionality covered by tests? -- [ ] For functionality that is impractical to test - - [ ] is there a demo? - - [ ] does it look like you'd expect? 
+ # State of PR -- [ ] Ready to merge on master -- [ ] CI passes -- [ ] Code is documented via rustdoc commments for readers post-landing -- [ ] Changes that need explanation pre-landing (why make the change) have self-review comments + +- [ ] Rustdoc and implementation comments? +- [ ] Self-review comments on non-obvious changes? +- [ ] Does it seem ready to merge on master? + diff --git a/src/lib.rs b/src/lib.rs index e7bee09..3f4b9aa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -603,7 +603,7 @@ where } /// An optimal reservoir sampling algorithm is implemented. -/// `ReservoirIterable` wraps a `StreamingIterator`, `I` and +/// `ReservoirSample` wraps a `StreamingIterator`, `I` and /// produces a `StreamingIterator` whose items are samples of size `capacity` /// from the stream of `I`. (This is not the capacity of the `Vec` which holds the `reservoir`; /// Rather, the length of the `reservoir` is normally referred to as its `capacity`.) @@ -619,7 +619,7 @@ where /// https://dl.acm.org/doi/abs/10.1145/198429.198435 #[derive(Debug, Clone)] -pub struct ReservoirIterable { +pub struct ReservoirSample { it: I, pub reservoir: Vec, capacity: usize, @@ -629,12 +629,12 @@ pub struct ReservoirIterable { } /// An adaptor for which the items are random samples of the underlying iterator up to the item processed. -/// The constructor for ReservoirIterable. -pub fn reservoir_iterable( +/// The constructor for ReservoirSample. 
+pub fn reservoir_sample( it: I, capacity: usize, custom_rng: Option, -) -> ReservoirIterable +) -> ReservoirSample where I: Sized + StreamingIterator, T: Clone, @@ -645,7 +645,7 @@ where }; let res: Vec = Vec::new(); let w_initial = (rng.gen::().ln() / (capacity as f64)).exp(); - ReservoirIterable { + ReservoirSample { it, reservoir: res, capacity, @@ -655,7 +655,7 @@ where } } -impl StreamingIterator for ReservoirIterable +impl StreamingIterator for ReservoirSample where T: Clone + std::fmt::Debug, I: StreamingIterator, @@ -805,7 +805,7 @@ where } /// The weighted reservoir sampling algorithm of M. T. Chao is implemented. -/// `WeightedReservoirIterable` wraps a `StreamingIterator`, `I`, whose items must be of type `WeightedDatum` and +/// `WeightedReservoirSample` wraps a `StreamingIterator`, `I`, whose items must be of type `WeightedDatum` and /// produces a `StreamingIterator` whose items are samples of size `capacity` /// from the stream of `I`. (This is not the capacity of the `Vec` which holds the `reservoir`; /// Rather, the length of the `reservoir` is normally referred to as its `capacity`.) @@ -824,7 +824,7 @@ where /// https://dl.acm.org/doi/10.1145/3350755.3400287 #[derive(Debug, Clone)] -pub struct WeightedReservoirIterable { +pub struct WeightedReservoirSample { it: I, pub reservoir: Vec>, capacity: usize, @@ -833,11 +833,11 @@ pub struct WeightedReservoirIterable { } /// Create a random sample of the underlying weighted stream. 
-pub fn weighted_reservoir_iterable( +pub fn weighted_reservoir_sample( it: I, capacity: usize, custom_rng: Option, -) -> WeightedReservoirIterable +) -> WeightedReservoirSample where I: Sized + StreamingIterator>, T: Clone, @@ -847,7 +847,7 @@ where None => Pcg64::from_entropy(), }; let reservoir: Vec> = Vec::new(); - WeightedReservoirIterable { + WeightedReservoirSample { it, reservoir, capacity, @@ -856,7 +856,7 @@ where } } -impl StreamingIterator for WeightedReservoirIterable +impl StreamingIterator for WeightedReservoirSample where T: Clone + std::fmt::Debug, I: StreamingIterator>, @@ -1113,7 +1113,7 @@ mod tests { assert_eq!("---\n- - 0\n - 3\n- - 1\n - 6\n- - 2\n - 9\n---\n- - 0\n - 5\n- - 1\n - 10\n- - 2\n - 15\n", &contents); } - /// Tests for the ReservoirIterable adaptor + /// Tests for the ReservoirSample adaptor /// /// This test asserts that the reservoir is filled with the correct items. #[test] @@ -1121,7 +1121,7 @@ mod tests { // v is the data stream. let v: Vec = vec![0.5, 0.2]; let iter = convert(v); - let mut iter = reservoir_iterable(iter, 2, None); + let mut iter = reservoir_sample(iter, 2, None); if let Some(reservoir) = iter.next() { assert_eq!(reservoir[0], 0.5); assert_eq!(reservoir[1], 0.2); @@ -1141,7 +1141,7 @@ mod tests { let final_stream = iter::repeat(1).take(stream_length - capacity); let stream = initial_stream.chain(final_stream); let stream = convert(stream); - let mut res_iter = reservoir_iterable(stream, capacity, None); + let mut res_iter = reservoir_sample(stream, capacity, None); if let Some(reservoir) = res_iter.next() { println!("Initial reservoir: \n {:#?} \n", reservoir); assert!(reservoir.into_iter().all(|x| *x == 0)); @@ -1162,7 +1162,7 @@ mod tests { assert!(final_reservoir.into_iter().sum::() >= 4); } - /// Tests for the WeightedReservoirIterable adaptor + /// Tests for the WeightedReservoirSample adaptor #[test] fn test_datum_struct() { let samp = new_datum(String::from("hi"), 1.0); @@ -1184,7 +1184,7 @@ mod tests 
{ // v is the data stream. let v: Vec> = vec![new_datum(0.5, 1.), new_datum(0.2, 2.)]; let iter = convert(v); - let mut iter = weighted_reservoir_iterable(iter, 2, None); + let mut iter = weighted_reservoir_sample(iter, 2, None); if let Some(reservoir) = iter.next() { assert_eq!( reservoir[0], @@ -1207,7 +1207,7 @@ mod tests { fn stream_smaller_than_weighted_reservoir_test() { let stream_vec = vec![new_datum(1, 1.0), new_datum(2, 1.0)]; let stream = convert(stream_vec); - let mut stream = weighted_reservoir_iterable(stream, 3, None); + let mut stream = weighted_reservoir_sample(stream, 3, None); while let Some(_reservoir) = stream.next() { println!("{:#?}", _reservoir); } @@ -1316,7 +1316,7 @@ mod tests { 1, ); let stream = convert(stream); - let mut wrs_iter = weighted_reservoir_iterable(stream, capacity, None); + let mut wrs_iter = weighted_reservoir_sample(stream, capacity, None); if let Some(reservoir) = wrs_iter.next() { assert!(reservoir.into_iter().all(|wd| wd.value == 0)); }; @@ -1359,7 +1359,7 @@ mod tests { 1, ); let stream = convert(stream); - let mut wrs_iter = weighted_reservoir_iterable(stream, capacity, None); + let mut wrs_iter = weighted_reservoir_sample(stream, capacity, None); if let Some(reservoir) = wrs_iter.next() { assert!(reservoir.into_iter().all(|wd| wd.value == 0)); }; diff --git a/src/utils.rs b/src/utils.rs index e1c12f6..a4ac58c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -84,7 +84,7 @@ pub fn mean_of_means_of_step_stream() -> f64 { final_value, ); let stream = wd_iterable(stream, |_x| 1f64); - let stream = weighted_reservoir_iterable(stream, capacity, None); + let stream = weighted_reservoir_sample(stream, capacity, None); let res = last(stream).unwrap(); let res: Vec = res.iter().map(|x| x.value).collect(); let mean = res.iter().sum::() as f64 / capacity as f64; @@ -132,7 +132,7 @@ pub fn generate_stream_from_normal_distribution( /// Utility Functions for Weighted Reservoir Sampling -/// utility function for testing 
ReservoirIterable +/// utility function for testing ReservoirSample pub fn generate_stream_with_constant_probability( stream_length: usize, capacity: usize, diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 4bd0170..ef5e1a5 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -35,7 +35,10 @@ fn test_timed_iterable() { let start_times = rcarr1(&start_times).map(|i| *i as f64); let st_diff = &start_times.slice(s![1..]) - &start_times.slice(s![..-1]); println!("start time diffs: {:?}", st_diff); - // Ensure times are within factor 10 of typical value observed in dev + // Ensure times are within factor 10 of typical value observed in + // dev. + // This is unfortunately not consistently true in CI envs, hence + // the test is ignored. assert!(durations.iter().all(|dur| 3000 < *dur && *dur < 300000)); // Ensure that start times are strictly increasing. assert!(st_diff.iter().all(|diff| *diff >= 0.)); @@ -67,7 +70,7 @@ fn wd_iterable_extract_value_test() { } } -/// Test the integration of ReservoirIterable, Enumerate, and ToFileIterable. +/// Test the integration of ReservoirSample, Enumerate, and ToFileIterable. /// /// A stream of 2 zeros and 8 ones subjected to reservoir sampling using a seeded rng. /// The stream of reservoirs is adapted with enumerate() and then write_yaml_documents(). 
After @@ -80,7 +83,7 @@ fn enumerate_reservoirs_to_yaml_test() { let initial_value = 0i64; let final_value = 1i64; let stream = generate_step_stream(stream_length, capacity, initial_value, final_value); - let stream = reservoir_iterable(stream, capacity, Some(Pcg64::seed_from_u64(0))); + let stream = reservoir_sample(stream, capacity, Some(Pcg64::seed_from_u64(0))); let stream = enumerate(stream); let mut stream = write_yaml_documents(stream, String::from(test_file_path)) .expect("Write scalar to Yaml: Create file and initialize Yaml iter failed."); diff --git a/visualizations_python/util.py b/visualizations_python/util.py index a6dc2f6..6374aac 100644 --- a/visualizations_python/util.py +++ b/visualizations_python/util.py @@ -8,14 +8,14 @@ 0) If you don't already have it, install Python3 following the instructions at https://www.python.org/downloads/. 1) Install pip and virtual env according to the instructions here: - https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#:~:text=Installing%20virtualenv&text=venv%20is%20included%20in%20the,Python%20packages%20for%20different%20projects. + https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/ 2) Set up a virtual environment that will contain the dependencies: - `$ virtualenv ` + `$ virtualenv p3` 3) Activate the environment: - `$ source /bin/activate` + `$ source p3/bin/activate` 4) Install the requirements using the requirements.txt file: `$ pip install -r ./visualizations_python/requirements.txt` @@ -25,4 +25,4 @@ To run the example without visualizations, add the command line argument 'false': `$ ./target/debug/examples/reservoir_histogram_animation false` - """ \ No newline at end of file + """