diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..41dfb0f --- /dev/null +++ b/.editorconfig @@ -0,0 +1,456 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +indent_size = 4 +indent_style = space +insert_final_newline = true +max_line_length = 120 +tab_width = 4 +ij_continuation_indent_size = 8 +ij_formatter_off_tag = @formatter:off +ij_formatter_on_tag = @formatter:on +ij_formatter_tags_enabled = true +ij_smart_tabs = false +ij_visual_guides = +ij_wrap_on_typing = false + +[.editorconfig] +ij_editorconfig_align_group_field_declarations = false +ij_editorconfig_space_after_colon = false +ij_editorconfig_space_after_comma = true +ij_editorconfig_space_before_colon = false +ij_editorconfig_space_before_comma = false +ij_editorconfig_spaces_around_assignment_operators = true + +[{*.c,*.c++,*.c++m,*.cc,*.ccm,*.cp,*.cpp,*.cppm,*.cu,*.cuh,*.cxx,*.cxxm,*.h,*.h++,*.hh,*.hp,*.hpp,*.hxx,*.i,*.icc,*.ii,*.inl,*.ino,*.ipp,*.ixx,*.m,*.mm,*.mxx,*.pch,*.tcc,*.tpp}] +ij_c_add_brief_tag = false +ij_c_add_getter_prefix = true +ij_c_add_setter_prefix = true +ij_c_align_dictionary_pair_values = false +ij_c_align_group_field_declarations = false +ij_c_align_init_list_in_columns = true +ij_c_align_multiline_array_initializer_expression = true +ij_c_align_multiline_assignment = true +ij_c_align_multiline_binary_operation = true +ij_c_align_multiline_chained_methods = false +ij_c_align_multiline_for = true +ij_c_align_multiline_ternary_operation = true +ij_c_array_initializer_comma_on_next_line = false +ij_c_array_initializer_new_line_after_left_brace = false +ij_c_array_initializer_right_brace_on_new_line = false +ij_c_array_initializer_wrap = normal +ij_c_assignment_wrap = off +ij_c_binary_operation_sign_on_next_line = false +ij_c_binary_operation_wrap = normal +ij_c_blank_lines_after_class_header = 0 +ij_c_blank_lines_after_imports = 1 +ij_c_blank_lines_around_class = 1 +ij_c_blank_lines_around_field = 0 +ij_c_blank_lines_around_field_in_interface = 0 +ij_c_blank_lines_around_method = 1 +ij_c_blank_lines_around_method_in_interface = 1 +ij_c_blank_lines_around_namespace = 0 +ij_c_blank_lines_around_properties_in_declaration = 0 +ij_c_blank_lines_around_properties_in_interface = 0 +ij_c_blank_lines_before_imports = 1 +ij_c_blank_lines_before_method_body = 0 +ij_c_block_brace_placement = end_of_line +ij_c_block_brace_style = end_of_line +ij_c_block_comment_at_first_column = true +ij_c_catch_on_new_line = false +ij_c_class_brace_style = end_of_line +ij_c_class_constructor_init_list_align_multiline = true +ij_c_class_constructor_init_list_comma_on_next_line = false +ij_c_class_constructor_init_list_new_line_after_colon = never +ij_c_class_constructor_init_list_new_line_before_colon = if_long +ij_c_class_constructor_init_list_wrap = normal +ij_c_copy_is_deep = false +ij_c_create_interface_for_categories = true +ij_c_declare_generated_methods = true +ij_c_description_include_member_names = true +ij_c_discharged_short_ternary_operator = false +ij_c_do_not_add_breaks = false +ij_c_do_while_brace_force = never +ij_c_else_on_new_line = false +ij_c_enum_constants_comma_on_next_line = false +ij_c_enum_constants_wrap = on_every_item +ij_c_for_brace_force = never +ij_c_for_statement_new_line_after_left_paren = false +ij_c_for_statement_right_paren_on_new_line = false +ij_c_for_statement_wrap = off +ij_c_function_brace_placement = end_of_line +ij_c_function_call_arguments_align_multiline = true +ij_c_function_call_arguments_align_multiline_pars = false 
+ij_c_function_call_arguments_comma_on_next_line = false +ij_c_function_call_arguments_new_line_after_lpar = false +ij_c_function_call_arguments_new_line_before_rpar = false +ij_c_function_call_arguments_wrap = normal +ij_c_function_non_top_after_return_type_wrap = normal +ij_c_function_parameters_align_multiline = true +ij_c_function_parameters_align_multiline_pars = false +ij_c_function_parameters_comma_on_next_line = false +ij_c_function_parameters_new_line_after_lpar = false +ij_c_function_parameters_new_line_before_rpar = false +ij_c_function_parameters_wrap = normal +ij_c_function_top_after_return_type_wrap = normal +ij_c_generate_additional_eq_operators = true +ij_c_generate_additional_rel_operators = true +ij_c_generate_class_constructor = true +ij_c_generate_comparison_operators_use_std_tie = false +ij_c_generate_instance_variables_for_properties = ask +ij_c_generate_operators_as_members = true +ij_c_header_guard_style_pattern = ${PROJECT_NAME}_${FILE_NAME}_${EXT} +ij_c_if_brace_force = never +ij_c_in_line_short_ternary_operator = true +ij_c_indent_block_comment = true +ij_c_indent_c_struct_members = 4 +ij_c_indent_case_from_switch = true +ij_c_indent_class_members = 4 +ij_c_indent_directive_as_code = false +ij_c_indent_implementation_members = 0 +ij_c_indent_inside_code_block = 4 +ij_c_indent_interface_members = 0 +ij_c_indent_interface_members_except_ivars_block = false +ij_c_indent_namespace_members = 4 +ij_c_indent_preprocessor_directive = 0 +ij_c_indent_visibility_keywords = 0 +ij_c_insert_override = true +ij_c_insert_virtual_with_override = false +ij_c_introduce_auto_consts = false +ij_c_introduce_auto_vars = false +ij_c_introduce_const_params = false +ij_c_introduce_const_vars = false +ij_c_introduce_constexpr_consts = false +ij_c_introduce_generate_property = false +ij_c_introduce_generate_synthesize = true +ij_c_introduce_globals_to_header = true +ij_c_introduce_prop_to_private_category = false +ij_c_introduce_static_consts = true +ij_c_introduce_use_ns_types = false +ij_c_ivars_prefix = _ +ij_c_ivars_suffix = +ij_c_keep_blank_lines_before_end = 2 +ij_c_keep_blank_lines_before_right_brace = 2 +ij_c_keep_blank_lines_in_code = 2 +ij_c_keep_blank_lines_in_declarations = 2 +ij_c_keep_case_expressions_in_one_line = false +ij_c_keep_control_statement_in_one_line = true +ij_c_keep_directive_at_first_column = true +ij_c_keep_first_column_comment = true +ij_c_keep_line_breaks = true +ij_c_keep_nested_namespaces_in_one_line = false +ij_c_keep_simple_blocks_in_one_line = true +ij_c_keep_simple_methods_in_one_line = true +ij_c_keep_structures_in_one_line = false +ij_c_lambda_capture_list_align_multiline = false +ij_c_lambda_capture_list_align_multiline_bracket = false +ij_c_lambda_capture_list_comma_on_next_line = false +ij_c_lambda_capture_list_new_line_after_lbracket = false +ij_c_lambda_capture_list_new_line_before_rbracket = false +ij_c_lambda_capture_list_wrap = off +ij_c_line_comment_add_space = false +ij_c_line_comment_at_first_column = true +ij_c_method_brace_placement = end_of_line +ij_c_method_call_arguments_align_by_colons = true +ij_c_method_call_arguments_align_multiline = false +ij_c_method_call_arguments_special_dictionary_pairs_treatment = true +ij_c_method_call_arguments_wrap = off +ij_c_method_call_chain_wrap = off +ij_c_method_parameters_align_by_colons = true +ij_c_method_parameters_align_multiline = false +ij_c_method_parameters_wrap = off +ij_c_namespace_brace_placement = end_of_line +ij_c_parentheses_expression_new_line_after_left_paren = false 
+ij_c_parentheses_expression_right_paren_on_new_line = false +ij_c_place_assignment_sign_on_next_line = false +ij_c_property_nonatomic = true +ij_c_put_ivars_to_implementation = true +ij_c_refactor_compatibility_aliases_and_classes = true +ij_c_refactor_properties_and_ivars = true +ij_c_release_style = ivar +ij_c_retain_object_parameters_in_constructor = true +ij_c_semicolon_after_method_signature = false +ij_c_shift_operation_align_multiline = true +ij_c_shift_operation_wrap = normal +ij_c_show_non_virtual_functions = false +ij_c_space_after_colon = true +ij_c_space_after_colon_in_foreach = true +ij_c_space_after_colon_in_selector = false +ij_c_space_after_comma = true +ij_c_space_after_cup_in_blocks = false +ij_c_space_after_dictionary_literal_colon = true +ij_c_space_after_for_semicolon = true +ij_c_space_after_init_list_colon = true +ij_c_space_after_method_parameter_type_parentheses = false +ij_c_space_after_method_return_type_parentheses = false +ij_c_space_after_pointer_in_declaration = false +ij_c_space_after_quest = true +ij_c_space_after_reference_in_declaration = false +ij_c_space_after_reference_in_rvalue = false +ij_c_space_after_structures_rbrace = true +ij_c_space_after_superclass_colon = true +ij_c_space_after_type_cast = true +ij_c_space_after_visibility_sign_in_method_declaration = true +ij_c_space_before_autorelease_pool_lbrace = true +ij_c_space_before_catch_keyword = true +ij_c_space_before_catch_left_brace = true +ij_c_space_before_catch_parentheses = true +ij_c_space_before_category_parentheses = true +ij_c_space_before_chained_send_message = true +ij_c_space_before_class_left_brace = true +ij_c_space_before_colon = true +ij_c_space_before_colon_in_foreach = false +ij_c_space_before_comma = false +ij_c_space_before_dictionary_literal_colon = false +ij_c_space_before_do_left_brace = true +ij_c_space_before_else_keyword = true +ij_c_space_before_else_left_brace = true +ij_c_space_before_export_lbrace = true +ij_c_space_before_for_left_brace = true +ij_c_space_before_for_parentheses = true +ij_c_space_before_for_semicolon = false +ij_c_space_before_if_left_brace = true +ij_c_space_before_if_parentheses = true +ij_c_space_before_init_list = false +ij_c_space_before_init_list_colon = true +ij_c_space_before_method_call_parentheses = false +ij_c_space_before_method_left_brace = true +ij_c_space_before_method_parentheses = false +ij_c_space_before_namespace_lbrace = true +ij_c_space_before_pointer_in_declaration = true +ij_c_space_before_property_attributes_parentheses = false +ij_c_space_before_protocols_brackets = true +ij_c_space_before_quest = true +ij_c_space_before_reference_in_declaration = true +ij_c_space_before_superclass_colon = true +ij_c_space_before_switch_left_brace = true +ij_c_space_before_switch_parentheses = true +ij_c_space_before_template_call_lt = false +ij_c_space_before_template_declaration_lt = false +ij_c_space_before_try_left_brace = true +ij_c_space_before_while_keyword = true +ij_c_space_before_while_left_brace = true +ij_c_space_before_while_parentheses = true +ij_c_space_between_adjacent_brackets = false +ij_c_space_between_operator_and_punctuator = false +ij_c_space_within_empty_array_initializer_braces = false +ij_c_spaces_around_additive_operators = true +ij_c_spaces_around_assignment_operators = true +ij_c_spaces_around_bitwise_operators = true +ij_c_spaces_around_equality_operators = true +ij_c_spaces_around_lambda_arrow = true +ij_c_spaces_around_logical_operators = true +ij_c_spaces_around_multiplicative_operators = true 
+ij_c_spaces_around_pm_operators = false +ij_c_spaces_around_relational_operators = true +ij_c_spaces_around_shift_operators = true +ij_c_spaces_around_unary_operator = false +ij_c_spaces_within_array_initializer_braces = false +ij_c_spaces_within_braces = true +ij_c_spaces_within_brackets = false +ij_c_spaces_within_cast_parentheses = false +ij_c_spaces_within_catch_parentheses = false +ij_c_spaces_within_category_parentheses = false +ij_c_spaces_within_empty_braces = false +ij_c_spaces_within_empty_function_call_parentheses = false +ij_c_spaces_within_empty_function_declaration_parentheses = false +ij_c_spaces_within_empty_lambda_capture_list_bracket = false +ij_c_spaces_within_empty_template_call_ltgt = false +ij_c_spaces_within_empty_template_declaration_ltgt = false +ij_c_spaces_within_for_parentheses = false +ij_c_spaces_within_function_call_parentheses = false +ij_c_spaces_within_function_declaration_parentheses = false +ij_c_spaces_within_if_parentheses = false +ij_c_spaces_within_lambda_capture_list_bracket = false +ij_c_spaces_within_method_parameter_type_parentheses = false +ij_c_spaces_within_method_return_type_parentheses = false +ij_c_spaces_within_parentheses = false +ij_c_spaces_within_property_attributes_parentheses = false +ij_c_spaces_within_protocols_brackets = false +ij_c_spaces_within_send_message_brackets = false +ij_c_spaces_within_structured_binding_list_bracket = false +ij_c_spaces_within_switch_parentheses = false +ij_c_spaces_within_template_call_ltgt = false +ij_c_spaces_within_template_declaration_ltgt = false +ij_c_spaces_within_template_double_gt = true +ij_c_spaces_within_while_parentheses = false +ij_c_special_else_if_treatment = true +ij_c_structured_binding_list_align_multiline = false +ij_c_structured_binding_list_align_multiline_bracket = false +ij_c_structured_binding_list_comma_on_next_line = false +ij_c_structured_binding_list_new_line_after_lbracket = false +ij_c_structured_binding_list_new_line_before_rbracket = false +ij_c_structured_binding_list_wrap = off +ij_c_superclass_list_after_colon = never +ij_c_superclass_list_align_multiline = true +ij_c_superclass_list_before_colon = if_long +ij_c_superclass_list_comma_on_next_line = false +ij_c_superclass_list_wrap = on_every_item +ij_c_tag_prefix_of_block_comment = at +ij_c_tag_prefix_of_line_comment = back_slash +ij_c_template_call_arguments_align_multiline = false +ij_c_template_call_arguments_align_multiline_pars = false +ij_c_template_call_arguments_comma_on_next_line = false +ij_c_template_call_arguments_new_line_after_lt = false +ij_c_template_call_arguments_new_line_before_gt = false +ij_c_template_call_arguments_wrap = off +ij_c_template_declaration_function_body_indent = false +ij_c_template_declaration_function_wrap = split_into_lines +ij_c_template_declaration_struct_body_indent = false +ij_c_template_declaration_struct_wrap = split_into_lines +ij_c_template_parameters_align_multiline = false +ij_c_template_parameters_align_multiline_pars = false +ij_c_template_parameters_comma_on_next_line = false +ij_c_template_parameters_new_line_after_lt = false +ij_c_template_parameters_new_line_before_gt = false +ij_c_template_parameters_wrap = off +ij_c_ternary_operation_signs_on_next_line = true +ij_c_ternary_operation_wrap = normal +ij_c_type_qualifiers_placement = before +ij_c_use_modern_casts = true +ij_c_use_setters_in_constructor = true +ij_c_while_brace_force = never +ij_c_while_on_new_line = false +ij_c_wrap_property_declaration = off + +[{*.cmake,CMakeLists.txt}] 
+ij_cmake_align_command_call_r_par = false +ij_cmake_align_control_flow_r_par = false +ij_cmake_align_multiline_parameters_in_calls = false +ij_cmake_force_commands_case = 2 +ij_cmake_keep_blank_lines_in_code = 2 +ij_cmake_space_before_for_parentheses = true +ij_cmake_space_before_if_parentheses = true +ij_cmake_space_before_method_call_parentheses = false +ij_cmake_space_before_method_parentheses = false +ij_cmake_space_before_while_parentheses = true +ij_cmake_spaces_within_for_parentheses = false +ij_cmake_spaces_within_if_parentheses = false +ij_cmake_spaces_within_method_call_parentheses = false +ij_cmake_spaces_within_method_parentheses = false +ij_cmake_spaces_within_while_parentheses = false + + +[{*.kt,*.kts}] +ij_kotlin_align_in_columns_case_branch = false +ij_kotlin_align_multiline_binary_operation = false +ij_kotlin_align_multiline_extends_list = false +ij_kotlin_align_multiline_method_parentheses = false +ij_kotlin_align_multiline_parameters = true +ij_kotlin_align_multiline_parameters_in_calls = false +ij_kotlin_allow_trailing_comma = false +ij_kotlin_allow_trailing_comma_on_call_site = false +ij_kotlin_assignment_wrap = normal +ij_kotlin_blank_lines_after_class_header = 0 +ij_kotlin_blank_lines_around_block_when_branches = 0 +ij_kotlin_blank_lines_before_declaration_with_comment_or_annotation_on_separate_line = 1 +ij_kotlin_block_comment_add_space = false +ij_kotlin_block_comment_at_first_column = true +ij_kotlin_call_parameters_new_line_after_left_paren = true +ij_kotlin_call_parameters_right_paren_on_new_line = true +ij_kotlin_call_parameters_wrap = on_every_item +ij_kotlin_catch_on_new_line = false +ij_kotlin_class_annotation_wrap = split_into_lines +ij_kotlin_code_style_defaults = KOTLIN_OFFICIAL +ij_kotlin_continuation_indent_for_chained_calls = false +ij_kotlin_continuation_indent_for_expression_bodies = false +ij_kotlin_continuation_indent_in_argument_lists = false +ij_kotlin_continuation_indent_in_elvis = false +ij_kotlin_continuation_indent_in_if_conditions = false +ij_kotlin_continuation_indent_in_parameter_lists = false +ij_kotlin_continuation_indent_in_supertype_lists = false +ij_kotlin_else_on_new_line = false +ij_kotlin_enum_constants_wrap = off +ij_kotlin_extends_list_wrap = normal +ij_kotlin_field_annotation_wrap = split_into_lines +ij_kotlin_finally_on_new_line = false +ij_kotlin_if_rparen_on_new_line = true +ij_kotlin_import_nested_classes = false +ij_kotlin_imports_layout = *,java.**,javax.**,kotlin.**,^ +ij_kotlin_insert_whitespaces_in_simple_one_line_method = true +ij_kotlin_keep_blank_lines_before_right_brace = 2 +ij_kotlin_keep_blank_lines_in_code = 2 +ij_kotlin_keep_blank_lines_in_declarations = 2 +ij_kotlin_keep_first_column_comment = true +ij_kotlin_keep_indents_on_empty_lines = false +ij_kotlin_keep_line_breaks = true +ij_kotlin_lbrace_on_next_line = false +ij_kotlin_line_break_after_multiline_when_entry = true +ij_kotlin_line_comment_add_space = false +ij_kotlin_line_comment_add_space_on_reformat = false +ij_kotlin_line_comment_at_first_column = true +ij_kotlin_method_annotation_wrap = split_into_lines +ij_kotlin_method_call_chain_wrap = normal +ij_kotlin_method_parameters_new_line_after_left_paren = true +ij_kotlin_method_parameters_right_paren_on_new_line = true +ij_kotlin_method_parameters_wrap = on_every_item +ij_kotlin_name_count_to_use_star_import = 5 +ij_kotlin_name_count_to_use_star_import_for_members = 3 +ij_kotlin_packages_to_use_import_on_demand = java.util.*,kotlinx.android.synthetic.**,io.ktor.** +ij_kotlin_parameter_annotation_wrap = 
off +ij_kotlin_space_after_comma = true +ij_kotlin_space_after_extend_colon = true +ij_kotlin_space_after_type_colon = true +ij_kotlin_space_before_catch_parentheses = true +ij_kotlin_space_before_comma = false +ij_kotlin_space_before_extend_colon = true +ij_kotlin_space_before_for_parentheses = true +ij_kotlin_space_before_if_parentheses = true +ij_kotlin_space_before_lambda_arrow = true +ij_kotlin_space_before_type_colon = false +ij_kotlin_space_before_when_parentheses = true +ij_kotlin_space_before_while_parentheses = true +ij_kotlin_spaces_around_additive_operators = true +ij_kotlin_spaces_around_assignment_operators = true +ij_kotlin_spaces_around_equality_operators = true +ij_kotlin_spaces_around_function_type_arrow = true +ij_kotlin_spaces_around_logical_operators = true +ij_kotlin_spaces_around_multiplicative_operators = true +ij_kotlin_spaces_around_range = false +ij_kotlin_spaces_around_relational_operators = true +ij_kotlin_spaces_around_unary_operator = false +ij_kotlin_spaces_around_when_arrow = true +ij_kotlin_variable_annotation_wrap = off +ij_kotlin_while_on_new_line = false +ij_kotlin_wrap_elvis_expressions = 1 +ij_kotlin_wrap_expression_body_functions = 1 +ij_kotlin_wrap_first_method_in_call_chain = false + +[{*.markdown,*.md}] +ij_markdown_force_one_space_after_blockquote_symbol = true +ij_markdown_force_one_space_after_header_symbol = true +ij_markdown_force_one_space_after_list_bullet = true +ij_markdown_force_one_space_between_words = true +ij_markdown_format_tables = true +ij_markdown_insert_quote_arrows_on_wrap = true +ij_markdown_keep_indents_on_empty_lines = false +ij_markdown_keep_line_breaks_inside_text_blocks = true +ij_markdown_max_lines_around_block_elements = 1 +ij_markdown_max_lines_around_header = 1 +ij_markdown_max_lines_between_paragraphs = 1 +ij_markdown_min_lines_around_block_elements = 1 +ij_markdown_min_lines_around_header = 1 +ij_markdown_min_lines_between_paragraphs = 1 +ij_markdown_wrap_text_if_long = true +ij_markdown_wrap_text_inside_blockquotes = true + +[{*.properties,spring.handlers,spring.schemas}] +ij_properties_align_group_field_declarations = false +ij_properties_keep_blank_lines = false +ij_properties_key_value_delimiter = equals +ij_properties_spaces_around_key_value_delimiter = false + +[{*.yaml,*.yml}] +indent_size = 2 +ij_yaml_align_values_properties = do_not_align +ij_yaml_autoinsert_sequence_marker = true +ij_yaml_block_mapping_on_new_line = false +ij_yaml_indent_sequence_value = true +ij_yaml_keep_indents_on_empty_lines = false +ij_yaml_keep_line_breaks = true +ij_yaml_sequence_on_new_line = false +ij_yaml_space_before_colon = false +ij_yaml_spaces_within_braces = true +ij_yaml_spaces_within_brackets = true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5dbefb8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +### IDE +.idea/ + +### Gradle +.gradle/ +.kotlin/ +build/ + +### CMake +cmake-build-*/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..16006b4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,26 @@ +3-Clause BSD NON-AI License + +Copyright 2024 Martmists + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +4. The source code and the binary form, and any modifications made to them may not be used for the purpose of training or improving machine learning algorithms, +including but not limited to artificial intelligence, natural language processing, or data mining. This condition applies to any derivatives, +modifications, or updates based on the Software code. Any usage of the source code or the binary form in an AI-training dataset is considered a breach of this License. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..5637456 --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +# NDArray.SIMD + +A Kotlin NDArray library with built-in SIMD support. + +### Installation + +Coming soon:tm: to my Maven repo. + +### Motivation + +I made this primarily because [Viktor](https://github.com/JetBrains-Research/viktor) doesn't really utilize its SIMD capabilities. + +As such, the JVM code for NDArray is largely the same as Viktor's. + +### License + +The nativeMain and jvmMain sourcesets are licensed under the [3-Clause BSD NON-AI License](https://github.com/non-ai-licenses/non-ai-licenses/blob/main/NON-AI-BSD3), with @Martmists-GH as the copyright holder. + +The commonMain sourceset is mostly copied/adapted from Viktor, and as such is licensed under the [original MIT license](https://github.com/JetBrains-Research/viktor/blob/master/LICENSE), with JetBrains BioLabs as the copyright holder.
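+
+### Example
+
+A rough sketch of the current API (names taken from the `F64Array` interface in this repo; the exact surface may change before release):
+
+```kotlin
+val a = F64Array.of(1.0, 2.0, 3.0, 4.0).reshape(2, 2)
+val b = F64Array.identity(2)
+val c = a + b  // element-wise, copying; `a += b` mutates in place
+println(c.sum())  // 12.0
+```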
diff --git a/build.gradle.kts b/build.gradle.kts new file mode 100644 index 0000000..6f268aa --- /dev/null +++ b/build.gradle.kts @@ -0,0 +1,148 @@ +import com.github.tomtzook.gcmake.tasks.CmakeBuildTask +import org.gradle.jvm.tasks.Jar +import org.jetbrains.kotlin.gradle.plugin.mpp.KotlinNativeTarget +import org.jetbrains.kotlin.gradle.plugin.mpp.KotlinNativeTargetWithSimulatorTests +import org.jetbrains.kotlin.gradle.targets.native.tasks.artifact.KotlinNativeLinkArtifactTask +import org.jetbrains.kotlin.gradle.tasks.KotlinNativeCompile +import org.jetbrains.kotlin.gradle.tasks.KotlinNativeLink + +plugins { + kotlin("multiplatform") version "2.0.0" + id("io.github.tomtzook.gradle-cmake") version "1.2.2" +} + +group = "com.martmists" +version = "1.0-SNAPSHOT" + +repositories { + mavenCentral() +} + +cmake { + targets { + val simd by creating { + cmakeLists = file("cmake/CMakeLists.txt") + + val linuxX64 by machines.customMachines.registering { + toolchainFile = file("cmake/toolchains/linux-x64.cmake") + } + val linuxArm64 by machines.customMachines.registering { + toolchainFile = file("cmake/toolchains/linux-arm64.cmake") + } + val mingwX64 by machines.customMachines.registering { + toolchainFile = file("cmake/toolchains/mingw-x64.cmake") + } + val mingwArm64 by machines.customMachines.registering { + toolchainFile = file("cmake/toolchains/mingw-arm64.cmake") + } + val macosX64 by machines.customMachines.registering { + toolchainFile = file("cmake/toolchains/macos-x64.cmake") + } + val macosArm64 by machines.customMachines.registering { + toolchainFile = file("cmake/toolchains/macos-arm64.cmake") + } + + if (project.hasProperty("production")) { + targetMachines.add(linuxX64) + targetMachines.add(linuxArm64) + targetMachines.add(mingwX64) +// targetMachines.add(mingwArm64) + targetMachines.add(macosX64) + targetMachines.add(macosArm64) + } else { + targetMachines.add(linuxX64) + } + + cmakeArgs = if (project.hasProperty("production")) { + listOf("-DCMAKE_BUILD_TYPE=Release") + } else { + listOf("-DCMAKE_BUILD_TYPE=Debug") + } + } + } +} + +kotlin { + jvm() + + val natives = if (project.hasProperty("production")) { + listOf( + linuxX64(), + linuxArm64(), + mingwX64(), +// mingwArm64(), + macosX64(), + macosArm64(), + ) + } else { + val osName = System.getProperty("os.name") + when { + osName == "Linux" -> listOf(linuxX64()) + osName.startsWith("Windows") -> listOf(mingwX64()) + osName == "Mac OS X" -> listOf(macosX64()) + else -> error("Unsupported OS") + } + } + + for (native in natives) { + native.apply { + binaries { + sharedLib { + baseName = "ndarray_simd" + } + } + + compilerOptions { + optIn = listOf( + "kotlin.experimental.ExperimentalNativeApi", + "kotlinx.cinterop.ExperimentalForeignApi", + ) + } + + compilations.named("main") { + val jni by cinterops.creating { + val javaHome = File(System.getProperty("java.home")!!)
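+                    // The JNI headers ship with the JDK under ${'$'}{java.home}/include; the
+                    // platform-specific subdirectories below cover every targeted OS, and
+                    // only the one matching the build host is expected to exist.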
+ defFile(project.projectDir.resolve("src/nativeMain/cinterops/jni.def")) + includeDirs( + javaHome.resolve("include"), + javaHome.resolve("include/linux"), + javaHome.resolve("include/darwin"), + javaHome.resolve("include/win32"), + ) + } + + val simd by cinterops.creating { + defFile(project.projectDir.resolve("src/nativeMain/cinterops/simd.def")) + includeDirs( + project.projectDir.resolve("src/lib"), + ) + + extraOpts("-libraryPath", projectDir.resolve("build/cmake/simd/${target.name}/").absolutePath) + } + } + } + } +} + +tasks { + val cmakeBuild by existing + + withType<KotlinNativeCompile> { + dependsOn(cmakeBuild) + } + + val jvmProcessResources by existing(Copy::class) { + val binaryName = if (project.hasProperty("production")) { + "releaseShared" + } else { + "debugShared" + } + + for (native in kotlin.targets.withType<KotlinNativeTarget>()) { + into("META-INF/natives/${native.targetName}") { + from(named(native.binaries.getByName(binaryName).linkTaskName)) { + exclude("**/*.h") + } + } + } + } +} diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt new file mode 100644 index 0000000..3e894cf --- /dev/null +++ b/cmake/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.30.0) +project(simd) + +file(GLOB_RECURSE source_files ../src/lib/cpp/*.cpp) + +include_directories(../src/lib/public) + +add_library( + simd STATIC ${source_files} +) diff --git a/cmake/arch/arm64.cmake b/cmake/arch/arm64.cmake new file mode 100644 index 0000000..0ca6d1d --- /dev/null +++ b/cmake/arch/arm64.cmake @@ -0,0 +1,2 @@ +set(CMAKE_SYSTEM_PROCESSOR aarch64) +set("CMAKE_C_FLAGS" "-march=armv8-a --no-standard-libraries") diff --git a/cmake/arch/x64.cmake b/cmake/arch/x64.cmake new file mode 100644 index 0000000..3f8b54f --- /dev/null +++ b/cmake/arch/x64.cmake @@ -0,0 +1,2 @@ +set(CMAKE_SYSTEM_PROCESSOR x86_64) +set("CMAKE_C_FLAGS" "-march=x86-64 --no-standard-libraries") diff --git a/cmake/platform/linux.cmake b/cmake/platform/linux.cmake new file mode 100644 index 0000000..3058d51 --- /dev/null +++ b/cmake/platform/linux.cmake @@ -0,0 +1,2 @@ +set(CMAKE_SYSTEM_NAME Linux) + diff --git a/cmake/platform/macos.cmake b/cmake/platform/macos.cmake new file mode 100644 index 0000000..8c0a861 --- /dev/null +++ b/cmake/platform/macos.cmake @@ -0,0 +1 @@ +set(CMAKE_SYSTEM_NAME Darwin) diff --git a/cmake/platform/windows.cmake b/cmake/platform/windows.cmake new file mode 100644 index 0000000..572f3fb --- /dev/null +++ b/cmake/platform/windows.cmake @@ -0,0 +1 @@ +set(CMAKE_SYSTEM_NAME Windows) diff --git a/cmake/toolchains/common.cmake b/cmake/toolchains/common.cmake new file mode 100644 index 0000000..612f58a --- /dev/null +++ b/cmake/toolchains/common.cmake @@ -0,0 +1,2 @@ +set(CMAKE_C_FLAGS "-O2 -fPIC") +set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS}") diff --git a/cmake/toolchains/linux-arm64.cmake b/cmake/toolchains/linux-arm64.cmake new file mode 100644 index 0000000..c2cd4cd --- /dev/null +++ b/cmake/toolchains/linux-arm64.cmake @@ -0,0 +1,3 @@ +include("${CMAKE_CURRENT_LIST_DIR}/common.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../platform/linux.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../arch/arm64.cmake") diff --git a/cmake/toolchains/linux-x64.cmake b/cmake/toolchains/linux-x64.cmake new file mode 100644 index 0000000..dfae332 --- /dev/null +++ b/cmake/toolchains/linux-x64.cmake @@ -0,0 +1,3 @@ +include("${CMAKE_CURRENT_LIST_DIR}/common.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../platform/linux.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../arch/x64.cmake") diff --git a/cmake/toolchains/macos-arm64.cmake
b/cmake/toolchains/macos-arm64.cmake new file mode 100644 index 0000000..a1e4401 --- /dev/null +++ b/cmake/toolchains/macos-arm64.cmake @@ -0,0 +1,3 @@ +include("${CMAKE_CURRENT_LIST_DIR}/common.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../platform/macos.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../arch/arm64.cmake") diff --git a/cmake/toolchains/macos-x64.cmake b/cmake/toolchains/macos-x64.cmake new file mode 100644 index 0000000..b2d4e5f --- /dev/null +++ b/cmake/toolchains/macos-x64.cmake @@ -0,0 +1,3 @@ +include("${CMAKE_CURRENT_LIST_DIR}/common.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../platform/macos.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../arch/x64.cmake") diff --git a/cmake/toolchains/mingw-arm64.cmake b/cmake/toolchains/mingw-arm64.cmake new file mode 100644 index 0000000..d02f294 --- /dev/null +++ b/cmake/toolchains/mingw-arm64.cmake @@ -0,0 +1,3 @@ +include("${CMAKE_CURRENT_LIST_DIR}/common.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../platform/windows.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../arch/arm64.cmake") diff --git a/cmake/toolchains/mingw-x64.cmake b/cmake/toolchains/mingw-x64.cmake new file mode 100644 index 0000000..c611168 --- /dev/null +++ b/cmake/toolchains/mingw-x64.cmake @@ -0,0 +1,3 @@ +include("${CMAKE_CURRENT_LIST_DIR}/common.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../platform/windows.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/../arch/x64.cmake") diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 0000000..4d40fac --- /dev/null +++ b/gradle.properties @@ -0,0 +1,9 @@ +org.gradle.parallel=true + +kotlin.code.style=official +kotlin.native.ignoreDisabledTargets=true +kotlin.mpp.enableCInteropCommonization=true +kotlin.mpp.enableCInteropCommonization.nowarn=true +kotlin.native.binary.sourceInfoType=libbacktrace + +kotlin.native.cacheKind.linuxX64=none diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000..249e583 Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..d681a49 --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Mon Jul 15 14:03:33 CEST 2024 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew new file mode 100755 index 0000000..6fea4ec --- /dev/null +++ b/gradlew @@ -0,0 +1,234 @@ +#!/bin/sh + +# +# Copyright © 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. 
If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# +############################################################################## + +# Attempt to set APP_HOME + +# Resolve links: $0 may be a link +app_path=$0 + +# Need this for daisy-chained symlinks. +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac +done + +APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit + +APP_NAME="Gradle" +APP_BASE_NAME=${0##*/} + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD=maximum + +warn () { + echo "$*" +} >&2 + +die () { + echo + echo "$*" + echo + exit 1 +} >&2 + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD=$JAVA_HOME/jre/sh/java + else + JAVACMD=$JAVA_HOME/bin/java + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD=java + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 
+ +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac +fi + +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the lib class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. + +# For Cygwin or MSYS, switch paths to Windows format before running java +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + + # Now convert the arguments - kludge to limit ourselves to /bin/sh + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) + fi + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg + done +fi + +# Collect all arguments for the java command; +# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of +# shell script including quotes and variable substitutions, so put them in +# double quotes to make sure that they get re-expanded; and +# * put everything else in single quotes, so that it's not re-expanded. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100644 index 0000000..ac1b06f --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,89 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. 
+@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/settings.gradle.kts b/settings.gradle.kts new file mode 100644 index 0000000..2188e23 --- /dev/null +++ b/settings.gradle.kts @@ -0,0 +1,9 @@ +pluginManagement { + repositories { + gradlePluginPortal() + google() + } +} + +rootProject.name = "kt-ndarray-simd" + diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/F64Array.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/F64Array.kt new file mode 100644 index 0000000..ced145d --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/F64Array.kt @@ -0,0 +1,258 @@ +package com.martmists.ndarray.simd + +import com.martmists.ndarray.simd.impl.create +import com.martmists.ndarray.simd.impl.product +import com.martmists.ndarray.simd.impl.unsupported +import kotlin.jvm.JvmName +import kotlin.math.* + + +/** + * The default methods are naive implementations for anyone wishing to implement their own NDArrays. + * However, I would recommend using delegation to [F64ArrayImpl] for most use-cases. 
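+ *
+ * A small usage sketch (using the companion-object factories declared below):
+ *
+ * ```
+ * val a = F64Array.of(1.0, 2.0, 3.0, 4.0).reshape(2, 2)
+ * a += F64Array.identity(2)  // element-wise, in place
+ * val mask = a gt 2.0        // comparisons produce 1.0/0.0 masks
+ * ```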
+ */ +interface F64Array { + val data: DoubleArray + val offset: Int + val strides: IntArray + val shape: IntArray + val unrollDim: Int + val unrollStride: Int + val unrollSize: Int + + val nDim: Int + get() = shape.size + val length: Int + get() = shape[0] + val isFlattenable: Boolean + + fun checkShape(other: F64Array): F64Array { + check(this === other || shape.contentEquals(other.shape)) { + "operands shapes do not match: ${shape.contentToString()} vs ${other.shape.contentToString()}" + } + return other + } + + operator fun get(vararg indices: Int): Double + operator fun get(r: Int, c: Int): Double + operator fun get(d: Int, r: Int, c: Int): Double + + operator fun set(vararg indices: Int, value: Double) + operator fun set(r: Int, c: Int, value: Double) + operator fun set(d: Int, r: Int, c: Int, value: Double) + + fun along(axis: Int): Sequence<F64Array> = (0 until shape[axis]).asSequence().map { view(it, axis) } + fun view(index: Int, axis: Int = 0): F64Array = unsupported() + + val V: Viewer + + fun copy(): F64Array = F64Array.create(data.copyOf(), offset, strides.copyOf(), shape.copyOf()) + fun copyTo(other: F64Array) = other.zipTransformInPlace(this) { _, d -> d } + fun reshape(vararg shape: Int): F64Array = flatten().reshape(*shape) + fun flatten(): F64FlatArray = unsupported() + fun slice(from: Int = 0, to: Int = -1, step: Int = 1, axis: Int = 0): F64Array + operator fun contains(other: Double): Boolean + fun fill(value: Double) = transformInPlace { value } + fun reorder(indices: IntArray, axis: Int = 0): Unit = unsupported() + infix fun dot(other: F64Array): Double = unsupported() + fun sum(): Double = reduce { acc, d -> acc + d } + fun min(): Double = fold(Double.POSITIVE_INFINITY) { acc, d -> if (d < acc) d else acc } + fun max(): Double = fold(Double.NEGATIVE_INFINITY) { acc, d -> if (d > acc) d else acc } + fun product(): Double = reduce { acc, d -> acc * d } + fun mean(): Double = sum() / shape.product() + fun variance(): Double = fold(0.0) { acc, d -> acc + (d - mean()).pow(2) } / shape.product() + fun stdDev(): Double = sqrt(variance()) + fun cumSumInPlace() { + var sum = 0.0 + transformInPlace { sum += it; sum } + } + fun cumSum(): F64Array = copy().apply { cumSumInPlace() } + fun coerceInPlace(min: Double, max: Double) = transformInPlace { it.coerceIn(min, max) } + fun coerce(min: Double, max: Double): F64Array = copy().apply { coerceInPlace(min, max) } + fun transformInPlace(transform: (Double) -> Double) + fun transform(transform: (Double) -> Double): F64Array = copy().apply { transformInPlace(transform) } + fun zipTransformInPlace(other: F64Array, transform: (Double, Double) -> Double) + fun zipTransform(other: F64Array, transform: (Double, Double) -> Double): F64Array = copy().apply { zipTransformInPlace(other, transform) } + fun <T> fold(initial: T, operation: (acc: T, Double) -> T): T + fun reduce(operation: (Double, Double) -> Double): Double + fun expInPlace() = transformInPlace(::exp) + fun exp(): F64Array = copy().apply { expInPlace() } + fun expm1InPlace() = transformInPlace(::expm1) + fun expm1(): F64Array = copy().apply { expm1InPlace() } + fun logInPlace() = transformInPlace(::ln) + fun log(): F64Array = copy().apply { logInPlace() } + fun log1pInPlace() = transformInPlace(::ln1p) + fun log1p(): F64Array = copy().apply { log1pInPlace() } + fun log2InPlace() = transformInPlace(::log2) + fun log2(): F64Array = copy().apply { log2InPlace() } + fun log10InPlace() = transformInPlace(::log10) + fun log10(): F64Array = copy().apply { log10InPlace() } + fun 
logBaseInPlace(base: Double) = transformInPlace { log2(it) / log2(base) } // On some systems this is fastest, on others it's slowest? + fun logBase(base: Double): F64Array = copy().apply { logBaseInPlace(base) } + fun sqrtInPlace() = transformInPlace(::sqrt) + fun sqrt(): F64Array = copy().apply { sqrtInPlace() } + fun powInPlace(power: Double) = transformInPlace { it.pow(power) } + fun pow(power: Double): F64Array = copy().apply { powInPlace(power) } + fun ipowInPlace(base: Double) = transformInPlace { base.pow(it) } + fun ipow(base: Double): F64Array = copy().apply { ipowInPlace(base) } + + operator fun unaryPlus(): F64Array = this + fun unaryMinusInPlace() = transformInPlace(Double::unaryMinus) + operator fun unaryMinus(): F64Array = copy().apply { unaryMinusInPlace() } + operator fun plusAssign(other: F64Array) = zipTransformInPlace(other) { a, b -> a + b } + operator fun plus(other: F64Array): F64Array = copy().apply { plusAssign(other) } + operator fun plusAssign(other: Double) = transformInPlace { it + other } + operator fun plus(other: Double): F64Array = copy().apply { plusAssign(other) } + operator fun minusAssign(other: F64Array) = zipTransformInPlace(other) { a, b -> a - b } + operator fun minus(other: F64Array): F64Array = copy().apply { minusAssign(other) } + operator fun minusAssign(other: Double) = transformInPlace { it - other } + operator fun minus(other: Double): F64Array = copy().apply { minusAssign(other) } + operator fun timesAssign(other: F64Array) = zipTransformInPlace(other) { a, b -> a * b } + operator fun times(other: F64Array): F64Array = copy().apply { timesAssign(other) } + operator fun timesAssign(other: Double) = transformInPlace { it * other } + operator fun times(other: Double): F64Array = copy().apply { timesAssign(other) } + operator fun divAssign(other: F64Array) = zipTransformInPlace(other) { a, b -> a / b } + operator fun div(other: F64Array): F64Array = copy().apply { divAssign(other) } + operator fun divAssign(other: Double) = transformInPlace { it / other } + operator fun div(other: Double): F64Array = copy().apply { divAssign(other) } + fun absInPlace() = transformInPlace(::abs) + fun abs(): F64Array = copy().apply { absInPlace() } + + fun ltInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a < b) 1.0 else 0.0 } + infix fun lt(other: F64Array): F64Array = copy().apply { ltInPlace(other) } + fun ltInPlace(other: Double) = transformInPlace { if (it < other) 1.0 else 0.0 } + infix fun lt(other: Double): F64Array = copy().apply { ltInPlace(other) } + fun lteInPlace(other: F64Array) { + gtInPlace(other) + xorInPlace(1) + } + infix fun lte(other: F64Array): F64Array = copy().apply { lteInPlace(other) } + fun lteInPlace(other: Double) { + gtInPlace(other) + xorInPlace(1) + } + infix fun lte(other: Double): F64Array = copy().apply { lteInPlace(other) } + fun gtInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a > b) 1.0 else 0.0 } + infix fun gt(other: F64Array): F64Array = copy().apply { gtInPlace(other) } + fun gtInPlace(other: Double) = transformInPlace { if (it > other) 1.0 else 0.0 } + infix fun gt(other: Double): F64Array = copy().apply { gtInPlace(other) } + fun gteInPlace(other: F64Array) { + ltInPlace(other) + xorInPlace(1) + } + infix fun gte(other: F64Array): F64Array = copy().apply { gteInPlace(other) } + fun gteInPlace(other: Double) { + ltInPlace(other) + xorInPlace(1) + } + infix fun gte(other: Double): F64Array = copy().apply { gteInPlace(other) } + fun eqInPlace(other: F64Array) = 
zipTransformInPlace(other) { a, b -> if (a == b) 1.0 else 0.0 } + infix fun eq(other: F64Array): F64Array = copy().apply { eqInPlace(other) } + fun eqInPlace(other: Double) = transformInPlace { if (it == other) 1.0 else 0.0 } + infix fun eq(other: Double): F64Array = copy().apply { eqInPlace(other) } + fun neqInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a != b) 1.0 else 0.0 } + infix fun neq(other: F64Array): F64Array = copy().apply { neqInPlace(other) } + fun neqInPlace(other: Double) = transformInPlace { if (it != other) 1.0 else 0.0 } + infix fun neq(other: Double): F64Array = copy().apply { neqInPlace(other) } + + fun andInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a != 0.0 && b != 0.0) 1.0 else 0.0 } + infix fun and(other: F64Array): F64Array = copy().apply { andInPlace(other) } + fun andInPlace(other: Int) = transformInPlace { (it.toInt() and other).toDouble() } + infix fun and(other: Int): F64Array = copy().apply { andInPlace(other) } + fun orInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a != 0.0 || b != 0.0) 1.0 else 0.0 } + infix fun or(other: F64Array): F64Array = copy().apply { orInPlace(other) } + fun orInPlace(other: Int) = transformInPlace { (it.toInt() or other).toDouble() } + infix fun or(other: Int): F64Array = copy().apply { orInPlace(other) } + fun xorInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a != b) 1.0 else 0.0 } + infix fun xor(other: F64Array): F64Array = copy().apply { xorInPlace(other) } + fun xorInPlace(other: Int) = transformInPlace { (it.toInt() xor other).toDouble() } + infix fun xor(other: Int): F64Array = copy().apply { xorInPlace(other) } + fun notInPlace() = transformInPlace { it.toInt().inv().toDouble() } + fun not(): F64Array = copy().apply { notInPlace() } + fun shlInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> (a.toInt() shl b.toInt()).toDouble() } + infix fun shl(other: F64Array): F64Array = copy().apply { shlInPlace(other) } + fun shlInPlace(other: Int) = transformInPlace { (it.toInt() shl other).toDouble() } + infix fun shl(other: Int): F64Array = copy().apply { shlInPlace(other) } + fun shrInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> (a.toInt() shr b.toInt()).toDouble() } + infix fun shr(other: F64Array): F64Array = copy().apply { shrInPlace(other) } + fun shrInPlace(other: Int) = transformInPlace { (it.toInt() shr other).toDouble() } + infix fun shr(other: Int): F64Array = copy().apply { shrInPlace(other) } + + fun sinInPlace() = transformInPlace(::sin) + fun sin(): F64Array = copy().apply { sinInPlace() } + fun cosInPlace() = transformInPlace(::cos) + fun cos(): F64Array = copy().apply { cosInPlace() } + fun tanInPlace() = transformInPlace(::tan) + fun tan(): F64Array = copy().apply { tanInPlace() } + fun asinInPlace() = transformInPlace(::asin) + fun asin(): F64Array = copy().apply { asinInPlace() } + fun acosInPlace() = transformInPlace(::acos) + fun acos(): F64Array = copy().apply { acosInPlace() } + fun atanInPlace() = transformInPlace(::atan) + fun atan(): F64Array = copy().apply { atanInPlace() } + fun atan2InPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> atan2(a, b) } + fun atan2(other: F64Array): F64Array = copy().apply { atan2InPlace(other) } + fun sinhInPlace() = transformInPlace(::sinh) + fun sinh(): F64Array = copy().apply { sinhInPlace() } + fun coshInPlace() = transformInPlace(::cosh) + fun cosh(): F64Array = copy().apply { coshInPlace() } + fun tanhInPlace() = transformInPlace(::tanh) + fun 
tanh(): F64Array = copy().apply { tanhInPlace() } + fun asinhInPlace() = transformInPlace(::asinh) + fun asinh(): F64Array = copy().apply { asinhInPlace() } + fun acoshInPlace() = transformInPlace(::acosh) + fun acosh(): F64Array = copy().apply { acoshInPlace() } + fun atanhInPlace() = transformInPlace(::atanh) + fun atanh(): F64Array = copy().apply { atanhInPlace() } + fun hypotInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> hypot(a, b) } + fun hypot(other: F64Array): F64Array = copy().apply { hypotInPlace(other) } + + infix fun matmul(other: F64Array): F64Array = unsupported() + + fun toDoubleArray(): DoubleArray = unsupported() + + companion object { + val simdSize by lazy { NativeSpeedup.getSimdSize() * 2 } + + operator fun invoke(vararg shape: Int) = F64FlatArray.create(DoubleArray(shape.product())).reshape(*shape) + operator fun invoke(size: Int, init: (Int) -> Double) = F64Array(size).apply { + for (i in 0 until size) { + this[i] = init(i) + } + } + operator fun invoke(numRows: Int, numColumns: Int, init: (Int, Int) -> Double) = F64Array(numRows, numColumns).apply { + for (r in 0 until numRows) { + for (c in 0 until numColumns) { + this[r, c] = init(r, c) + } + } + } + + fun of(vararg values: Double) = F64FlatArray.of(values) + + @JvmName("ofArray") + fun of(data: DoubleArray) = F64FlatArray.of(data) + + fun full(shape: IntArray, init: Double): F64Array { + return F64FlatArray.create(DoubleArray(shape.product()).apply { fill(init) }).reshape(*shape) + } + + fun identity(n: Int): F64Array = zeros(intArrayOf(n, n)).apply { + for (i in 0 until n) { + this[i, i] = 1.0 + } + } + + fun diagonal(values: DoubleArray): F64Array { + val n = values.size + val result = zeros(intArrayOf(n, n)) + for (i in 0 until n) { + result[i, i] = values[i] + } + return result + } + + fun zeros(shape: IntArray): F64Array = full(shape, 0.0) + fun ones(shape: IntArray): F64Array = full(shape, 1.0) + } +} diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/F64FlatArray.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/F64FlatArray.kt new file mode 100644 index 0000000..51a09f1 --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/F64FlatArray.kt @@ -0,0 +1,30 @@ +package com.martmists.ndarray.simd + +import com.martmists.ndarray.simd.impl.create + +/** + * A 1D specialization type for [F64Array]. 
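+ *
+ * For example (a sketch; note that [argMin]/[argMax] below return indices, not values):
+ *
+ * ```
+ * val v = F64FlatArray.of(doubleArrayOf(3.0, 1.0, 2.0))
+ * v.argMin()  // 1, the index of the smallest element
+ * ```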
+ */ +interface F64FlatArray : F64Array { + override val isFlattenable: Boolean + get() = true + + override fun checkShape(other: F64Array): F64FlatArray { + check(this === other || (other is F64FlatArray && shape[0] == other.shape[0])) { + "operands shapes do not match: ${shape.contentToString()} vs ${other.shape.contentToString()}" + } + return other as F64FlatArray + } + + override fun flatten(): F64FlatArray = this + + operator fun get(pos: Int): Double + operator fun set(pos: Int, value: Double) + + fun argMin(): Int = (0 until length).minBy(::get) + fun argMax(): Int = (0 until length).maxBy(::get) + + companion object { + fun of(data: DoubleArray): F64FlatArray = F64FlatArray.create(data) + } +} diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/NativeSpeedup.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/NativeSpeedup.kt new file mode 100644 index 0000000..ccf5ccd --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/NativeSpeedup.kt @@ -0,0 +1,76 @@ +package com.martmists.ndarray.simd + +internal expect object NativeSpeedup { + fun vecAddVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecAddScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun vecSubVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecSubScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun vecMulVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecMulScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun vecDivVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecDivScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun vecNegate(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecAbs(a: DoubleArray, aOffset: Int, aSize: Int) + + fun vecAndVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecAndScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + fun vecOrVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecOrScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + fun vecXorVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecXorScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + fun vecNot(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecLShiftVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecLShiftScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + fun vecRShiftVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecRShiftScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + + fun vecEqVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecEqScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun vecNeqVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecNeqScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun vecLtVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecLtScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun vecGtVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecGtScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + + fun vecSqrt(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecPow(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun veciPow(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun vecLog(a: DoubleArray, aOffset: 
Int, aSize: Int) + fun vecLogBase(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + fun vecExp(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecExpm1(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecLog1p(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecLog2(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecLog10(a: DoubleArray, aOffset: Int, aSize: Int) + + fun vecCopy(dest: DoubleArray, destOffset: Int, destSize: Int, src: DoubleArray, srcOffset: Int) + fun getSimdSize(): Int + + fun vecSum(a: DoubleArray, aOffset: Int, aSize: Int): Double + fun vecMin(a: DoubleArray, aOffset: Int, aSize: Int): Double + fun vecMax(a: DoubleArray, aOffset: Int, aSize: Int): Double + fun vecProduct(a: DoubleArray, aOffset: Int, aSize: Int): Double + fun vecMean(a: DoubleArray, aOffset: Int, aSize: Int): Double + fun vecVariance(a: DoubleArray, aOffset: Int, aSize: Int): Double + fun vecStdDev(a: DoubleArray, aOffset: Int, aSize: Int): Double + fun vecCoerce(a: DoubleArray, aOffset: Int, aSize: Int, min: Double, max: Double) + + fun vecSin(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecCos(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecTan(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecAsin(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecAcos(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecAtan(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecAtan2(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + fun vecSinh(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecCosh(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecTanh(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecAsinh(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecAcosh(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecAtanh(a: DoubleArray, aOffset: Int, aSize: Int) + fun vecHypot(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + + fun vecDot(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int): Double + fun vecMatMul(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int, n: Int, m: Int, p: Int): DoubleArray +} diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/Viewer.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/Viewer.kt new file mode 100644 index 0000000..4c7b02a --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/Viewer.kt @@ -0,0 +1,12 @@ +package com.martmists.ndarray.simd + +interface Viewer { + operator fun get(vararg indices: Int): F64Array + operator fun set(vararg indices: Int, other: F64Array) + operator fun set(vararg indices: Int, init: Double) + operator fun set(any: _I, other: F64Array) + operator fun set(any: _I, other: Double) + operator fun get(any: _I, c: Int): F64Array + operator fun set(any: _I, c: Int, other: F64Array) + operator fun set(any: _I, c: Int, init: Double) +} diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/_I.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/_I.kt new file mode 100644 index 0000000..d2621d3 --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/_I.kt @@ -0,0 +1,3 @@ +package com.martmists.ndarray.simd + +object _I diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/compat.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/compat.kt new file mode 100644 index 0000000..95ccc85 --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/compat.kt @@ -0,0 +1,10 @@ +package com.martmists.ndarray.simd + +operator fun Double.plus(arr: F64Array): F64Array = arr.plus(this) +operator fun 
Double.minus(arr: F64Array): F64Array = arr.copy().apply { transformInPlace { this@minus - it } } +operator fun Double.times(arr: F64Array): F64Array = arr.times(this) +operator fun Double.div(arr: F64Array): F64Array = arr.copy().apply { transformInPlace { this@div / it } } + +fun Double.pow(arr: F64Array): F64Array = arr.ipow(this) + +fun DoubleArray.toF64Array() = F64Array.of(this) diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64ArrayImpl.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64ArrayImpl.kt new file mode 100644 index 0000000..7fde269 --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64ArrayImpl.kt @@ -0,0 +1,436 @@ +package com.martmists.ndarray.simd.impl + +import com.martmists.ndarray.simd.* + +internal open class F64ArrayImpl internal constructor( + override val data: DoubleArray, + override val offset: Int, + override val strides: IntArray, + override val shape: IntArray, + override val unrollDim: Int, + override val unrollStride: Int, + override val unrollSize: Int ) : F64Array { + override val isFlattenable = unrollDim == nDim + + protected inline fun F64Array.unsafeIndex(r: Int, c: Int): Int { + return offset + r * strides[0] + c * strides[1] + } + + protected inline fun F64Array.unsafeIndex(d: Int, r: Int, c: Int): Int { + return offset + d * strides[0] + r * strides[1] + c * strides[2] + } + + protected inline fun F64Array.unsafeIndex(indices: IntArray): Int { + return strides.foldIndexed(offset) { i, acc, stride -> acc + indices[i] * stride } + } + + override fun get(vararg indices: Int): Double { + check(indices.size == nDim) { "broadcasting get is not supported" } + for (d in 0 until nDim) { + checkIndex("index", indices[d], shape[d]) + } + return data[unsafeIndex(indices)] + } + + override fun get(r: Int, c: Int): Double { + check(nDim == 2) { "broadcasting get is not supported" } + checkIndex("row", r, shape[0]) + checkIndex("column", c, shape[1]) + return data[unsafeIndex(r, c)] + } + + override fun get(d: Int, r: Int, c: Int): Double { + check(nDim == 3) { "broadcasting get is not supported" } + checkIndex("depth", d, shape[0]) + checkIndex("row", r, shape[1]) + checkIndex("column", c, shape[2]) + return data[unsafeIndex(d, r, c)] + } + + override fun set(vararg indices: Int, value: Double) { + check(indices.size == nDim) { "broadcasting set is not supported" } + for (d in 0 until nDim) { + checkIndex("index", indices[d], shape[d]) + } + data[unsafeIndex(indices)] = value + } + + override operator fun set(r: Int, c: Int, value: Double) { + check(nDim == 2) { "broadcasting set is not supported" } + checkIndex("row", r, shape[0]) + checkIndex("column", c, shape[1]) + data[unsafeIndex(r, c)] = value + } + + override operator fun set(d: Int, r: Int, c: Int, value: Double) { + check(nDim == 3) { "broadcasting set is not supported" } + checkIndex("depth", d, shape[0]) + checkIndex("row", r, shape[1]) + checkIndex("column", c, shape[2]) + data[unsafeIndex(d, r, c)] = value + } + + override fun view(index: Int, axis: Int): F64Array { + checkIndex("axis", axis, nDim) + checkIndex("index", index, shape[axis]) + return F64Array.create( + data, offset + strides[axis] * index, + strides.remove(axis), shape.remove(axis) + ) + } + + override val V: Viewer by lazy(LazyThreadSafetyMode.PUBLICATION) { ViewerImpl(this) } + + private class ViewerImpl(private val a: F64Array) : Viewer { + override fun get(vararg indices: Int): F64Array = a.view0(indices) + + override fun get(any: _I, c: Int): F64Array = a.view(c, axis=1) + + override fun set(vararg indices: Int, other: F64Array) { +
other.copyTo(a.view0(indices)) + } + + override fun set(vararg indices: Int, init: Double) { + a.view0(indices).fill(init) + } + + override fun set(any: _I, other: F64Array) { + other.copyTo(a) + } + + override fun set(any: _I, other: Double) { + a.fill(other) + } + + override fun set(any: _I, c: Int, other: F64Array) { + other.copyTo(a.view(c, axis=1)) + } + + override fun set(any: _I, c: Int, init: Double) { + a.view(c, axis=1).fill(init) + } + } + + override fun copy(): F64Array { + return F64Array.full(shape, 0.0).also { this.copyTo(it) } + } + + override fun copyTo(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.copyTo(b) } + } + + override fun flatten(): F64FlatArray { + check(isFlattenable) { "array can't be flattened" } + return F64FlatArray.create(data, offset, unrollStride, unrollSize) + } + + override fun slice(from: Int, to: Int, step: Int, axis: Int): F64Array { + require(step > 0) { "slicing step must be positive, but was $step" } + require(axis in 0 until nDim) { "axis out of bounds: $axis" } + require(from >= 0) { "slicing start index must be positive, but was $from" } + val actualTo = if (to != -1) { + require(to > from) { "slicing end index $to must be greater than start index $from" } + check(to <= shape[axis]) { "slicing end index out of bounds: $to > ${shape[axis]}" } + to + } else { + check(shape[axis] > from) { "slicing start index out of bounds: $from >= ${shape[axis]}" } + shape[axis] + } + + val sliceStrides = strides.copyOf().apply { this[axis] *= step } + val sliceShape = shape.copyOf().apply { + this[axis] = (actualTo - from + step - 1) / step + } + return F64Array.create(data, offset + from * strides[axis], sliceStrides, sliceShape) + } + + override fun contains(other: Double): Boolean = unrollToFlat().any { it.contains(other) } + + override fun fill(value: Double) = unrollToFlat().forEach { it.fill(value) } + + override fun reorder(indices: IntArray, axis: Int) { + reorderInternal( + this, indices, axis, + get = { pos -> view(pos, axis).copy() }, + set = { pos, value -> value.copyTo(view(pos, axis)) } + ) + } + + override fun sum(): Double = unrollToFlat().map { it.sum() }.sum() + + override fun min(): Double = unrollToFlat().map { it.min() }.minOrNull() ?: Double.POSITIVE_INFINITY + + override fun max(): Double = unrollToFlat().map { it.max() }.maxOrNull() ?: Double.NEGATIVE_INFINITY + + override fun product(): Double = unrollToFlat().map { it.product() }.reduce(Double::times) + + override fun coerceInPlace(min: Double, max: Double) { + unrollToFlat().forEach { it.coerceInPlace(min, max) } + } + + override fun transformInPlace(transform: (Double) -> Double) { + unrollToFlat().forEach { it.transformInPlace(transform) } + } + + override fun zipTransformInPlace(other: F64Array, transform: (Double, Double) -> Double) { + commonUnrollToFlat(other) { a, b -> a.zipTransformInPlace(b, transform) } + } + + override fun <T> fold(initial: T, operation: (acc: T, Double) -> T): T { + if (isFlattenable) { + return flatten().fold(initial, operation) + } + return unrollToFlat().fold(initial) { acc, f64FlatArray -> f64FlatArray.fold(acc, operation) } + } + + override fun reduce(operation: (Double, Double) -> Double): Double { + if (isFlattenable) { + return flatten().reduce(operation) + } + val sequence = unrollToFlat() + val initial = sequence.first().reduce(operation) + return sequence.drop(1).fold(initial) { acc, f64FlatArray -> f64FlatArray.fold(acc, operation) } + } + + override fun expInPlace() { + unrollToFlat().forEach { it.expInPlace() } + } + + override fun expm1InPlace() { +
unrollToFlat().forEach { it.expm1InPlace() } + } + + override fun logInPlace() { + unrollToFlat().forEach { it.logInPlace() } + } + + override fun log1pInPlace() { + unrollToFlat().forEach { it.log1pInPlace() } + } + + override fun log2InPlace() { + unrollToFlat().forEach { it.log2InPlace() } + } + + override fun log10InPlace() { + unrollToFlat().forEach { it.log10InPlace() } + } + + override fun logBaseInPlace(base: Double) { + unrollToFlat().forEach { it.logBaseInPlace(base) } + } + + override fun sqrtInPlace() { + unrollToFlat().forEach { it.sqrtInPlace() } + } + + override fun powInPlace(power: Double) { + unrollToFlat().forEach { it.powInPlace(power) } + } + + override fun ipowInPlace(base: Double) { + unrollToFlat().forEach { it.ipowInPlace(base) } + } + + override fun unaryMinusInPlace() { + unrollToFlat().forEach { it.unaryMinusInPlace() } + } + + override fun plusAssign(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.plusAssign(b) } + } + + override fun plusAssign(other: Double) { + unrollToFlat().forEach { it.plusAssign(other) } + } + + override fun minusAssign(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.minusAssign(b) } + } + + override fun minusAssign(other: Double) { + unrollToFlat().forEach { it.minusAssign(other) } + } + + override fun timesAssign(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.timesAssign(b) } + } + + override fun timesAssign(other: Double) { + unrollToFlat().forEach { it.timesAssign(other) } + } + + override fun divAssign(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.divAssign(b) } + } + + override fun divAssign(other: Double) { + unrollToFlat().forEach { it.divAssign(other) } + } + + override fun absInPlace() { + unrollToFlat().forEach { it.absInPlace() } + } + + override fun ltInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.ltInPlace(b) } + } + + override fun ltInPlace(other: Double) { + unrollToFlat().forEach { it.ltInPlace(other) } + } + + override fun gtInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.gtInPlace(b) } + } + + override fun gtInPlace(other: Double) { + unrollToFlat().forEach { it.gtInPlace(other) } + } + + override fun eqInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.eqInPlace(b) } + } + + override fun eqInPlace(other: Double) { + unrollToFlat().forEach { it.eqInPlace(other) } + } + + override fun neqInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.neqInPlace(b) } + } + + override fun neqInPlace(other: Double) { + unrollToFlat().forEach { it.neqInPlace(other) } + } + + override fun andInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.andInPlace(b) } + } + + override fun andInPlace(other: Int) { + unrollToFlat().forEach { it.andInPlace(other) } + } + + override fun orInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.orInPlace(b) } + } + + override fun orInPlace(other: Int) { + unrollToFlat().forEach { it.orInPlace(other) } + } + + override fun xorInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.xorInPlace(b) } + } + + override fun xorInPlace(other: Int) { + unrollToFlat().forEach { it.xorInPlace(other) } + } + + override fun notInPlace() { + unrollToFlat().forEach { it.notInPlace() } + } + + override fun shlInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.shlInPlace(b) } + } + + override fun shlInPlace(other: Int) { + unrollToFlat().forEach { it.shlInPlace(other) } + } + + override fun shrInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, 
b -> a.shrInPlace(b) } + } + + override fun shrInPlace(other: Int) { + unrollToFlat().forEach { it.shrInPlace(other) } + } + + override fun sinInPlace() { + unrollToFlat().forEach { it.sinInPlace() } + } + + override fun cosInPlace() { + unrollToFlat().forEach { it.cosInPlace() } + } + + override fun tanInPlace() { + unrollToFlat().forEach { it.tanInPlace() } + } + + override fun asinInPlace() { + unrollToFlat().forEach { it.asinInPlace() } + } + + override fun acosInPlace() { + unrollToFlat().forEach { it.acosInPlace() } + } + + override fun atanInPlace() { + unrollToFlat().forEach { it.atanInPlace() } + } + + override fun atan2InPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.atan2InPlace(b) } + } + + override fun sinhInPlace() { + unrollToFlat().forEach { it.sinhInPlace() } + } + + override fun coshInPlace() { + unrollToFlat().forEach { it.coshInPlace() } + } + + override fun tanhInPlace() { + unrollToFlat().forEach { it.tanhInPlace() } + } + + override fun asinhInPlace() { + unrollToFlat().forEach { it.asinhInPlace() } + } + + override fun acoshInPlace() { + unrollToFlat().forEach { it.acoshInPlace() } + } + + override fun atanhInPlace() { + unrollToFlat().forEach { it.atanhInPlace() } + } + + override fun hypotInPlace(other: F64Array) { + commonUnrollToFlat(other) { a, b -> a.hypotInPlace(b) } + } + + override fun matmul(other: F64Array): F64Array { + check(nDim == 2) { "matmul is only supported for 2D arrays" } + check(other.nDim == 2) { "matmul is only supported for 2D arrays" } + check(shape[1] == other.shape[0]) { + "matmul dimensions do not match: ${shape[1]} != ${other.shape[0]}" + } + val resultShape = intArrayOf(shape[0], other.shape[1]) + val result = F64Array.full(resultShape, 0.0) + for (i in 0 until shape[0]) { + for (j in 0 until other.shape[1]) { + for (k in 0 until shape[1]) { + result[i, j] += this[i, k] * other[k, j] + } + } + } + return result + } + + override fun equals(other: Any?): Boolean = when { + this === other -> true + other !is F64Array -> false + !shape.contentEquals(other.shape) -> false + else -> (0 until length).all { view(it) == other.view(it) } + } + + override fun hashCode(): Int = (0 until length).fold(1) { acc, r -> + 31 * acc + view(r).hashCode() + } +} diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64DenseFlatArrayBase.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64DenseFlatArrayBase.kt new file mode 100644 index 0000000..367640c --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64DenseFlatArrayBase.kt @@ -0,0 +1,98 @@ +package com.martmists.ndarray.simd.impl + +import com.martmists.ndarray.simd.F64Array +import com.martmists.ndarray.simd.F64FlatArray + +// TODO: Dense array for ND? 
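+// Dispatch note (see F64FlatArray.Companion.create in impl/internal.kt below): +// stride-1 data of size <= F64Array.simdSize becomes F64SmallDenseFlatArrayImpl +// (plain Kotlin loops), larger dense data becomes F64LargeDenseFlatArrayImpl +// (NativeSpeedup SIMD calls), and strided views fall back to F64FlatArrayImpl.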
+internal abstract class F64DenseFlatArrayBase( + data: DoubleArray, + offset: Int, + size: Int +) : F64FlatArrayImpl(data, offset, 1, size) { + override fun fill(init: Double) = data.fill(init, offset, offset + length) + + override fun copy(): F64FlatArray { + val copyData = DoubleArray(length) + data.copyInto(copyData, 0, offset, offset + length) + return F64FlatArray.create(copyData, 0) + } + + override fun copyTo(other: F64Array) { + if (other is F64DenseFlatArrayBase) { + checkShape(other) + // both sides are dense, so delegate to the stdlib array copy + data.copyInto(other.data, other.offset, offset, offset + length) + } else { + super.copyTo(other) + } + } + + override fun toDoubleArray(): DoubleArray { + return data.copyOfRange(offset, offset + length) + } + + override fun transformInPlace(transform: (Double) -> Double) { + var dstOffset = offset + val dstEnd = dstOffset + length + while (dstOffset < dstEnd) { + data[dstOffset] = transform(data[dstOffset]) + dstOffset++ + } + } + + override fun transform(transform: (Double) -> Double): F64FlatArray { + val dst = DoubleArray(length) + var srcOffset = offset + for (i in 0 until length) { + dst[i] = transform(data[srcOffset]) + srcOffset++ + } + return F64FlatArray.create(dst, 0, 1, length) + } + + override fun zipTransformInPlace(other: F64Array, transform: (Double, Double) -> Double) { + if (other is F64DenseFlatArrayBase) { + checkShape(other) + if (offset == 0 && other.offset == 0) { + for (i in 0 until length) { + this[i] = transform(data[i], other.data[i]) + } + } else { + var dstOffset = offset + var srcOffset = other.offset + val dstEnd = offset + length + while (dstOffset < dstEnd) { + data[dstOffset] = transform(data[dstOffset], other.data[srcOffset]) + dstOffset++ + srcOffset++ + } + } + } else { + super.zipTransformInPlace(other, transform) + } + } + + override fun zipTransform(other: F64Array, transform: (Double, Double) -> Double): F64FlatArray { + if (other is F64DenseFlatArrayBase) { + checkShape(other) + val dst = DoubleArray(length) + if (offset == 0 && other.offset == 0) { + for (i in 0 until length) { + dst[i] = transform(data[i], other.data[i]) + } + } else { + var dstOffset = 0 + var srcOffset = offset + var otherOffset = other.offset + for (i in 0 until length) { + dst[dstOffset] = transform(data[srcOffset], other.data[otherOffset]) + dstOffset++ + srcOffset++ + otherOffset++ + } + } + return F64FlatArray.create(dst, 0, 1, length) + } else { + return super.zipTransform(other, transform) + } + } +}
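+ +// Note on the branches above: the offset == 0 fast paths index the backing +// arrays directly, while the general branches advance explicit cursors so +// non-zero offsets into a shared backing array are honoured.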
 diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64FlatArrayImpl.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64FlatArrayImpl.kt new file mode 100644 index 0000000..602d547 --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64FlatArrayImpl.kt @@ -0,0 +1,256 @@ +package com.martmists.ndarray.simd.impl + +import com.martmists.ndarray.simd.F64Array +import com.martmists.ndarray.simd.F64FlatArray +import kotlin.math.* + +internal open class F64FlatArrayImpl internal constructor( + data: DoubleArray, + offset: Int, + stride: Int, + size: Int ) : F64ArrayImpl(data, offset, intArrayOf(stride), intArrayOf(size), 1, stride, size), F64FlatArray { + override val isFlattenable: Boolean = true + + protected val unsafeGet: (Int) -> Double = { data[it * stride + offset] } + protected val unsafeSet: (Int, Double) -> Unit = { i, v -> data[i * stride + offset] = v } + + override fun flatten(): F64FlatArray = this + + override operator fun get(pos: Int): Double { + checkIndex("pos", pos, length) + return unsafeGet(pos) + } + + override operator fun set(pos: Int, value: Double) { + checkIndex("pos", pos, length) + unsafeSet(pos, value) + } + + override fun contains(other: Double): Boolean { + for (pos in 0 until length) { + if (unsafeGet(pos) == other) { + return true + } + } + + return false + } + + override fun along(axis: Int) = unsupported() + + override fun view(index: Int, axis: Int) = unsupported() + + override fun copyTo(other: F64Array) { + val o = checkShape(other) + o as F64FlatArrayImpl + for (pos in 0 until length) { + o.unsafeSet(pos, unsafeGet(pos)) + } + } + + override fun copy(): F64FlatArray = F64FlatArray.create(toDoubleArray(), 0, 1, length) + + override fun fill(init: Double) { + for (pos in 0 until length) { + unsafeSet(pos, init) + } + } + + override fun reorder(indices: IntArray, axis: Int) { + if (axis == 0) { + reorderInternal(this, indices, axis, + get = { pos -> unsafeGet(pos) }, + set = { pos, value -> unsafeSet(pos, value) }) + } else { + unsupported() + } + } + + private inline fun balancedSum(getter: (Int) -> Double): Double { + var accUnaligned = 0.0 + var remaining = length + while (remaining % 4 > 0) { + remaining-- + accUnaligned += getter(remaining) + } + val stack = DoubleArray(31 - 2) + var p = 0 + var i = 0 + while (i < remaining) { + // Shift. + var v = getter(i) + getter(i + 1) + val w = getter(i + 2) + getter(i + 3) + v += w + + // Reduce. + var bitmask = 4 + while (i and bitmask != 0) { + v += stack[--p] + bitmask = bitmask shl 1 + } + stack[p++] = v + i += 4 + } + var acc = 0.0 + while (p > 0) { + acc += stack[--p] + } + return acc + accUnaligned + } + + override fun dot(other: F64Array) = balancedSum { unsafeGet(it) * other[it] } + + override fun sum(): Double = balancedSum { unsafeGet(it) } + + override fun min() = unsafeGet(argMin()) + + override fun argMin(): Int { + var minValue = Double.POSITIVE_INFINITY + var res = 0 + for (pos in 0 until length) { + val value = unsafeGet(pos) + if (value <= minValue) { + minValue = value + res = pos + } + } + return res + } + + override fun max() = unsafeGet(argMax()) + + override fun argMax(): Int { + var maxValue = Double.NEGATIVE_INFINITY + var res = 0 + for (pos in 0 until length) { + val value = unsafeGet(pos) + if (value >= maxValue) { + maxValue = value + res = pos + } + } + return res + } + + override fun transformInPlace(transform: (Double) -> Double) { + for (pos in 0 until length) { + unsafeSet(pos, transform.invoke(unsafeGet(pos))) + } + } + + override fun transform(transform: (Double) -> Double): F64FlatArray { + val res = DoubleArray(length) + for (pos in 0 until length) { + res[pos] = transform.invoke(unsafeGet(pos)) + } + return F64FlatArray.create(res) + } + + override fun zipTransformInPlace(other: F64Array, transform: (Double, Double) -> Double) { + val o = checkShape(other) + o as F64FlatArrayImpl + for (pos in 0 until length) { + unsafeSet(pos, transform.invoke(unsafeGet(pos), o.unsafeGet(pos))) + } + } + + override fun zipTransform(other: F64Array, transform: (Double, Double) -> Double): F64FlatArray { + val o = checkShape(other) + o as F64FlatArrayImpl + val res = DoubleArray(length) + for (pos in 0 until length) { + res[pos] = transform.invoke(unsafeGet(pos), o.unsafeGet(pos)) + } + return F64FlatArray.create(res, 0, 1, length) + } + + override fun <T> fold(initial: T, operation: (T, Double) -> T): T { + var res = initial + for (pos in 0 until length) { + res = operation(res, unsafeGet(pos)) + } + return res + } + + override fun reduce(operation: (Double, Double) -> Double): Double { + var res = unsafeGet(0) + for (pos in 1 until
length) { + res = operation(res, unsafeGet(pos)) + } + return res + } + + override fun coerceInPlace(min: Double, max: Double) = transformInPlace { it.coerceIn(min, max) } + override fun expInPlace() = transformInPlace(::exp) + override fun expm1InPlace() = transformInPlace(::expm1) + override fun logInPlace() = transformInPlace(::ln) + override fun log1pInPlace() = transformInPlace(::ln1p) + override fun log2InPlace() = transformInPlace(::log2) + override fun log10InPlace() = transformInPlace(::log10) + override fun logBaseInPlace(base: Double) = log2(base).let { lb -> transformInPlace { log2(it) / lb } } + override fun sqrtInPlace() = transformInPlace(::sqrt) + override fun powInPlace(power: Double) = transformInPlace { it.pow(power) } + override fun ipowInPlace(base: Double) = transformInPlace { base.pow(it) } + override fun unaryMinusInPlace() = transformInPlace(Double::unaryMinus) + override fun plusAssign(other: F64Array) = zipTransformInPlace(other, Double::plus) + override fun plusAssign(other: Double) = transformInPlace { it + other } + override fun minusAssign(other: F64Array) = zipTransformInPlace(other, Double::minus) + override fun minusAssign(other: Double) = transformInPlace { it - other } + override fun timesAssign(other: F64Array) = zipTransformInPlace(other, Double::times) + override fun timesAssign(other: Double) = transformInPlace { it * other } + override fun divAssign(other: F64Array) = zipTransformInPlace(other, Double::div) + override fun divAssign(other: Double) = transformInPlace { it / other } + override fun absInPlace() = transformInPlace(Double::absoluteValue) + override fun ltInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a < b) 1.0 else 0.0 } + override fun ltInPlace(other: Double) = transformInPlace { if (it < other) 1.0 else 0.0 } + override fun gtInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a > b) 1.0 else 0.0 } + override fun gtInPlace(other: Double) = transformInPlace { if (it > other) 1.0 else 0.0 } + override fun eqInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a == b) 1.0 else 0.0 } + override fun eqInPlace(other: Double) = transformInPlace { if (it == other) 1.0 else 0.0 } + override fun neqInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> if (a != b) 1.0 else 0.0 } + override fun neqInPlace(other: Double) = transformInPlace { if (it != other) 1.0 else 0.0 } + override fun andInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> (a.toInt() and b.toInt()).toDouble() } + override fun andInPlace(other: Int) = transformInPlace { (it.toInt() and other).toDouble() } + override fun orInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> (a.toInt() or b.toInt()).toDouble() } + override fun orInPlace(other: Int) = transformInPlace { (it.toInt() or other).toDouble() } + override fun xorInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> (a.toInt() xor b.toInt()).toDouble() } + override fun xorInPlace(other: Int) = transformInPlace { (it.toInt() xor other).toDouble() } + override fun notInPlace() = transformInPlace { it.toInt().inv().toDouble() } + override fun shlInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> (a.toInt() shl b.toInt()).toDouble() } + override fun shlInPlace(other: Int) = transformInPlace { (it.toInt() shl other).toDouble() } + override fun shrInPlace(other: F64Array) = zipTransformInPlace(other) { a, b -> (a.toInt() shr b.toInt()).toDouble() } + override fun shrInPlace(other: Int) = transformInPlace { (it.toInt() shr 
other).toDouble() } + override fun sinInPlace() = transformInPlace(::sin) + override fun cosInPlace() = transformInPlace(::cos) + override fun tanInPlace() = transformInPlace(::tan) + override fun asinInPlace() = transformInPlace(::asin) + override fun acosInPlace() = transformInPlace(::acos) + override fun atanInPlace() = transformInPlace(::atan) + override fun atan2InPlace(other: F64Array) = zipTransformInPlace(other, ::atan2) + override fun sinhInPlace() = transformInPlace(::sinh) + override fun coshInPlace() = transformInPlace(::cosh) + override fun tanhInPlace() = transformInPlace(::tanh) + override fun asinhInPlace() = transformInPlace(::asinh) + override fun acoshInPlace() = transformInPlace(::acosh) + override fun atanhInPlace() = transformInPlace(::atanh) + override fun hypotInPlace(other: F64Array) = zipTransformInPlace(other, ::hypot) + + override fun toDoubleArray() = DoubleArray(length) { unsafeGet(it) } + + override fun equals(other: Any?) = when { + this === other -> true + other !is F64FlatArrayImpl -> false // an instance of F64Array can't be flat + length != other.length -> false + else -> (0 until length).all { + (unsafeGet(it) - other.unsafeGet(it)).absoluteValue < 1e-10 + } + } + + override fun hashCode() = (0 until length).fold(1) { acc, pos -> + // XXX calling #hashCode results in boxing, see KT-7571. + 31 * acc + unsafeGet(pos).hashCode() + } +} diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64LargeDenseFlatArrayImpl.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64LargeDenseFlatArrayImpl.kt new file mode 100644 index 0000000..d70984d --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64LargeDenseFlatArrayImpl.kt @@ -0,0 +1,190 @@ +package com.martmists.ndarray.simd.impl + +import com.martmists.ndarray.simd.F64Array +import com.martmists.ndarray.simd.NativeSpeedup + +internal class F64LargeDenseFlatArrayImpl( + data: DoubleArray, + offset: Int, + size: Int +) : F64DenseFlatArrayBase(data, offset, size) { + override fun dot(other: F64Array): Double { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + return NativeSpeedup.vecDot(data, offset, length, other.data, other.offset) + } else { + return super.dot(other) + } + } + override fun sum() = NativeSpeedup.vecSum(data, offset, length) + override fun max() = NativeSpeedup.vecMax(data, offset, length) + override fun min() = NativeSpeedup.vecMin(data, offset, length) + override fun product() = NativeSpeedup.vecProduct(data, offset, length) + override fun mean() = NativeSpeedup.vecMean(data, offset, length) + override fun variance() = NativeSpeedup.vecVariance(data, offset, length) + override fun stdDev() = NativeSpeedup.vecStdDev(data, offset, length) + override fun coerceInPlace(min: Double, max: Double) = NativeSpeedup.vecCoerce(data, offset, length, min, max) + + override fun expInPlace() = NativeSpeedup.vecExp(data, offset, length) + override fun expm1InPlace() = NativeSpeedup.vecExpm1(data, offset, length) + override fun logInPlace() = NativeSpeedup.vecLog(data, offset, length) + override fun log1pInPlace() = NativeSpeedup.vecLog1p(data, offset, length) + override fun log2InPlace() = NativeSpeedup.vecLog2(data, offset, length) + override fun log10InPlace() = NativeSpeedup.vecLog10(data, offset, length) + override fun logBaseInPlace(base: Double) = NativeSpeedup.vecLogBase(data, offset, length, base) + override fun sqrtInPlace() = NativeSpeedup.vecSqrt(data, offset, length) + override fun powInPlace(power: Double) = NativeSpeedup.vecPow(data, 
offset, length, power) + override fun ipowInPlace(base: Double) = NativeSpeedup.veciPow(data, offset, length, base) + + override fun unaryMinusInPlace() = NativeSpeedup.vecNegate(data, offset, length) + override fun plusAssign(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecAddVec(data, offset, length, other.data, other.offset) + } else { + super.plusAssign(other) + } + } + override fun plusAssign(other: Double) = NativeSpeedup.vecAddScalar(data, offset, length, other) + override fun minusAssign(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecSubVec(data, offset, length, other.data, other.offset) + } else { + super.minusAssign(other) + } + } + override fun minusAssign(other: Double) = NativeSpeedup.vecSubScalar(data, offset, length, other) + override fun timesAssign(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecMulVec(data, offset, length, other.data, other.offset) + } else { + super.timesAssign(other) + } + } + override fun timesAssign(other: Double) = NativeSpeedup.vecMulScalar(data, offset, length, other) + override fun divAssign(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecDivVec(data, offset, length, other.data, other.offset) + } else { + super.divAssign(other) + } + } + override fun divAssign(other: Double) = NativeSpeedup.vecDivScalar(data, offset, length, other) + override fun absInPlace() = NativeSpeedup.vecAbs(data, offset, length) + + override fun ltInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecLtVec(data, offset, length, other.data, other.offset) + } else { + super.ltInPlace(other) + } + } + override fun ltInPlace(other: Double) = NativeSpeedup.vecLtScalar(data, offset, length, other) + override fun gtInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecGtVec(data, offset, length, other.data, other.offset) + } else { + super.gtInPlace(other) + } + } + override fun gtInPlace(other: Double) = NativeSpeedup.vecGtScalar(data, offset, length, other) + override fun eqInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecEqVec(data, offset, length, other.data, other.offset) + } else { + super.eqInPlace(other) + } + } + override fun eqInPlace(other: Double) = NativeSpeedup.vecEqScalar(data, offset, length, other) + override fun neqInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecNeqVec(data, offset, length, other.data, other.offset) + } else { + super.neqInPlace(other) + } + } + override fun neqInPlace(other: Double) = NativeSpeedup.vecNeqScalar(data, offset, length, other) + + override fun andInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecAndVec(data, offset, length, other.data, other.offset) + } else { + super.andInPlace(other) + } + } + override fun andInPlace(other: Int) = NativeSpeedup.vecAndScalar(data, offset, length, other) + override fun orInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecOrVec(data, offset, length, other.data, other.offset) + } else { + super.orInPlace(other) + } + } + override fun orInPlace(other: Int) = NativeSpeedup.vecOrScalar(data, offset, length, other) + override fun 
xorInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecXorVec(data, offset, length, other.data, other.offset) + } else { + super.xorInPlace(other) + } + } + override fun xorInPlace(other: Int) = NativeSpeedup.vecXorScalar(data, offset, length, other) + override fun notInPlace() = NativeSpeedup.vecNot(data, offset, length) + override fun shlInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecLShiftVec(data, offset, length, other.data, other.offset) + } else { + super.shlInPlace(other) + } + } + override fun shlInPlace(other: Int) = NativeSpeedup.vecLShiftScalar(data, offset, length, other) + override fun shrInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecRShiftVec(data, offset, length, other.data, other.offset) + } else { + super.shrInPlace(other) + } + } + override fun shrInPlace(other: Int) = NativeSpeedup.vecRShiftScalar(data, offset, length, other) + + override fun sinInPlace() = NativeSpeedup.vecSin(data, offset, length) + override fun cosInPlace() = NativeSpeedup.vecCos(data, offset, length) + override fun tanInPlace() = NativeSpeedup.vecTan(data, offset, length) + override fun asinInPlace() = NativeSpeedup.vecAsin(data, offset, length) + override fun acosInPlace() = NativeSpeedup.vecAcos(data, offset, length) + override fun atanInPlace() = NativeSpeedup.vecAtan(data, offset, length) + override fun atan2InPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecAtan2(data, offset, length, other.data, other.offset) + } else { + super.atan2InPlace(other) + } + } + override fun sinhInPlace() = NativeSpeedup.vecSinh(data, offset, length) + override fun coshInPlace() = NativeSpeedup.vecCosh(data, offset, length) + override fun tanhInPlace() = NativeSpeedup.vecTanh(data, offset, length) + override fun asinhInPlace() = NativeSpeedup.vecAsinh(data, offset, length) + override fun acoshInPlace() = NativeSpeedup.vecAcosh(data, offset, length) + override fun atanhInPlace() = NativeSpeedup.vecAtanh(data, offset, length) + override fun hypotInPlace(other: F64Array) { + if (other is F64LargeDenseFlatArrayImpl) { + checkShape(other) + NativeSpeedup.vecHypot(data, offset, length, other.data, other.offset) + } else { + super.hypotInPlace(other) + } + } +} diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64SmallDenseFlatArrayImpl.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64SmallDenseFlatArrayImpl.kt new file mode 100644 index 0000000..749dccd --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/F64SmallDenseFlatArrayImpl.kt @@ -0,0 +1,9 @@ +package com.martmists.ndarray.simd.impl + +internal class F64SmallDenseFlatArrayImpl( + data: DoubleArray, + offset: Int, + size: Int +) : F64DenseFlatArrayBase(data, offset, size) { + +} diff --git a/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/internal.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/internal.kt new file mode 100644 index 0000000..3d57555 --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/internal.kt @@ -0,0 +1,136 @@ +package com.martmists.ndarray.simd.impl + +import com.martmists.ndarray.simd.F64Array +import com.martmists.ndarray.simd.F64FlatArray + +internal fun F64Array.view0(indices: IntArray): F64Array { + require(indices.size < nDim) { "too many indices" } + return indices.fold(this) { m, pos -> m.view(pos) } +} + 
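+// Worked example for intuition: a [3, 2] view with strides [1, 3] (a transposed +// 2x3 array) is not flattenable; calculateUnroll reports unrollDim = 1, so +// unrollOnce splits it into three flat runs of length 2, each striding by 3 over +// the backing array. +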
+internal fun F64Array.unrollOnce(n: Int = unrollDim): Sequence<F64Array> { + val newStrides = strides.slice(n until nDim).toIntArray() + val newShape = shape.slice(n until nDim).toIntArray() + val currentUnrollStride = if (n == unrollDim) unrollStride else run { + var nonTrivialN = n - 1 + while (nonTrivialN >= 0 && shape[nonTrivialN] <= 1) nonTrivialN-- + if (nonTrivialN >= 0) strides[nonTrivialN] else 0 + } + val currentUnrollSize = if (n == unrollDim) unrollSize else shape.slice(0 until n).toIntArray().product() + + return (0 until currentUnrollSize).asSequence().map { i -> + F64Array.create(data, offset + currentUnrollStride * i, newStrides, newShape) + } +} + +internal fun F64Array.unrollToFlat(): Sequence<F64FlatArray> { + if (isFlattenable) return sequenceOf(flatten()) + return unrollOnce().flatMap { it.unrollToFlat() } +} + +internal fun F64Array.commonUnrollToFlat( + other: F64Array, + action: (F64FlatArray, F64FlatArray) -> Unit ) { + checkShape(other) + val commonUnrollDim = kotlin.math.min(unrollDim, other.unrollDim) + if (commonUnrollDim == nDim) { + action(flatten(), other.flatten()) + } else { + unrollOnce(commonUnrollDim).zip(other.unrollOnce(commonUnrollDim)).forEach { (a, b) -> + a.commonUnrollToFlat(b, action) + } + } +} + +internal data class Unroll(val dim: Int, val stride: Int, val size: Int) + +internal fun calculateUnroll(strides: IntArray, shape: IntArray): Unroll { + var prevStride = 0 + var unrollable = true + var d = 0 + var s = 0 + for (i in strides.indices) { + if (shape[i] == 1) { + if (unrollable) d = i + 1 + continue + } + if (unrollable && (prevStride == 0 || prevStride == strides[i] * shape[i])) { + d = i + 1 + s = strides[i] + } else { + unrollable = false + } + prevStride = strides[i] + } + return Unroll(d, s, shape.slice(0 until d).toIntArray().product()) +} + +internal fun IntArray.remove(pos: Int) = when (pos) { + 0 -> sliceArray(1..lastIndex) + lastIndex -> sliceArray(0 until lastIndex) + else -> sliceArray(0 until pos) + sliceArray(pos + 1..lastIndex) +} + +internal inline fun unsupported(): Nothing = throw UnsupportedOperationException() + +internal inline fun <T> reorderInternal( + a: F64Array, + indices: IntArray, + axis: Int, + get: (Int) -> T, + set: (Int, T) -> Unit ) { + require(indices.size == a.shape[axis]) + + val copy = indices.copyOf() + for (pos in 0 until a.shape[axis]) { + val value = get(pos) + var j = pos + while (true) { + val k = copy[j] + copy[j] = j + if (k == pos) { + set(j, value) + break + } else { + set(j, get(k)) + j = k + } + } + } +} + +internal fun F64Array.Companion.create( + data: DoubleArray, + offset: Int, + strides: IntArray, + shape: IntArray, +): F64Array { + require(strides.size == shape.size) { "strides and shape size don't match" } + require(strides.isNotEmpty()) { "singleton arrays are not supported" } + return if (shape.size == 1) { + F64FlatArray.create(data, offset, strides.single(), shape.single()) + } else { + val (unrollDim, unrollStride, unrollSize) = calculateUnroll(strides, shape) + F64ArrayImpl(data, offset, strides, shape, unrollDim, unrollStride, unrollSize) + } +} + +internal fun F64FlatArray.Companion.create( + data: DoubleArray, + offset: Int = 0, + stride: Int = 1, + size: Int = data.size ): F64FlatArray { + require(size > 0) { "empty arrays not supported" } + return if (stride == 1) { + if (size <= F64Array.simdSize) { + F64SmallDenseFlatArrayImpl(data, offset, size) + } else { + F64LargeDenseFlatArrayImpl(data, offset, size) + } + } else { + F64FlatArrayImpl(data, offset, stride, size) + } +} diff --git
a/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/util.kt b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/util.kt new file mode 100644 index 0000000..6abd667 --- /dev/null +++ b/src/commonMain/kotlin/com/martmists/ndarray/simd/impl/util.kt @@ -0,0 +1,9 @@ +package com.martmists.ndarray.simd.impl + +inline fun checkIndex(label: String, pos: Int, size: Int) { + if (pos < 0 || pos >= size) { + throw IndexOutOfBoundsException("$label must be in [0, $size), but was $pos") + } +} + +inline fun IntArray.product() = fold(1, Int::times) diff --git a/src/jvmMain/kotlin/com/martmists/ndarray/simd/Main.kt b/src/jvmMain/kotlin/com/martmists/ndarray/simd/Main.kt new file mode 100644 index 0000000..ab47103 --- /dev/null +++ b/src/jvmMain/kotlin/com/martmists/ndarray/simd/Main.kt @@ -0,0 +1,13 @@ +package com.martmists.ndarray.simd + +import kotlin.random.Random + +fun main() { + val arr = DoubleArray(23) { Random.nextDouble() } + val f64Array = F64Array.of(arr) + + println(f64Array::class.simpleName) + + println((-f64Array).toDoubleArray().contentToString()) + println(f64Array.abs().toDoubleArray().contentToString()) +} diff --git a/src/jvmMain/kotlin/com/martmists/ndarray/simd/NativeSpeedup.jvm.kt b/src/jvmMain/kotlin/com/martmists/ndarray/simd/NativeSpeedup.jvm.kt new file mode 100644 index 0000000..ee66707 --- /dev/null +++ b/src/jvmMain/kotlin/com/martmists/ndarray/simd/NativeSpeedup.jvm.kt @@ -0,0 +1,101 @@ +package com.martmists.ndarray.simd + +import java.io.File + +internal actual object NativeSpeedup { + init { + val osName = System.getProperty("os.name") + val platform = when { + osName.startsWith("Linux") -> "linux" + osName.startsWith("Mac") -> "macos" + osName.startsWith("Windows") -> "windows" + else -> throw UnsupportedOperationException("Unsupported platform: $osName") + } + val arch = when (val osArch = System.getProperty("os.arch")) { + "x86_64", "amd64" -> "X64" + "aarch64" -> "Arm64" + else -> throw UnsupportedOperationException("Unsupported architecture: $osArch") + } + + val tmp = File.createTempFile("libndarray_simd", ".so") + tmp.deleteOnExit() + + val lib = NativeSpeedup::class.java.getResourceAsStream("/META-INF/natives/$platform$arch/libndarray_simd.so")!! 
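+ // copy the bundled library out to a real file; use {} closes both streams, + // so the library is fully written to disk before System.load maps it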
+ lib.use { input -> tmp.outputStream().use { output -> input.copyTo(output) } } + + System.load(tmp.absolutePath) + } + + actual external fun vecAddVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecAddScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun vecSubVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecSubScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun vecMulVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecMulScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun vecDivVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecDivScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun vecNegate(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecAbs(a: DoubleArray, aOffset: Int, aSize: Int) + + actual external fun vecAndVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecAndScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + actual external fun vecOrVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecOrScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + actual external fun vecXorVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecXorScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + actual external fun vecNot(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecLShiftVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecLShiftScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + actual external fun vecRShiftVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecRShiftScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) + + actual external fun vecEqVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecEqScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun vecNeqVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecNeqScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun vecLtVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecLtScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun vecGtVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecGtScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + + actual external fun vecSqrt(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecPow(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun veciPow(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun vecLog(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecLogBase(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) + actual external fun vecExp(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecExpm1(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecLog1p(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecLog2(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecLog10(a: DoubleArray, aOffset: Int, aSize: Int) + + actual
external fun vecCopy(dest: DoubleArray, destOffset: Int, destSize: Int, src: DoubleArray, srcOffset: Int) + actual external fun getSimdSize(): Int + + actual external fun vecSum(a: DoubleArray, aOffset: Int, aSize: Int): Double + actual external fun vecMin(a: DoubleArray, aOffset: Int, aSize: Int): Double + actual external fun vecMax(a: DoubleArray, aOffset: Int, aSize: Int): Double + actual external fun vecProduct(a: DoubleArray, aOffset: Int, aSize: Int): Double + actual external fun vecMean(a: DoubleArray, aOffset: Int, aSize: Int): Double + actual external fun vecVariance(a: DoubleArray, aOffset: Int, aSize: Int): Double + actual external fun vecStdDev(a: DoubleArray, aOffset: Int, aSize: Int): Double + actual external fun vecCoerce(a: DoubleArray, aOffset: Int, aSize: Int, min: Double, max: Double) + + actual external fun vecSin(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecCos(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecTan(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecAsin(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecAcos(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecAtan(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecAtan2(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + actual external fun vecSinh(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecCosh(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecTanh(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecAsinh(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecAcosh(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecAtanh(a: DoubleArray, aOffset: Int, aSize: Int) + actual external fun vecHypot(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) + + actual external fun vecDot(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int): Double + actual external fun vecMatMul(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int, n: Int, m: Int, p: Int): DoubleArray +} diff --git a/src/lib/cpp/arithmetic.cpp b/src/lib/cpp/arithmetic.cpp new file mode 100644 index 0000000..d599f3c --- /dev/null +++ b/src/lib/cpp/arithmetic.cpp @@ -0,0 +1,147 @@ +#include "common.h" + +extern "C" { + void vec_add_scalar(double* a, double b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = va + b; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = a[i] + b; + } + } + + void vec_add_vec(double* a, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = va + vb; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = a[i] + b[i]; + } + } + + void vec_sub_scalar(double* a, double b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = va - b; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = a[i] - b; + } + } + + void vec_sub_vec(double* a, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = 
xsimd::load_unaligned(&a[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = va - vb; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = a[i] - b[i]; + } + } + + void vec_mul_scalar(double* a, double b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = va * b; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = a[i] * b; + } + } + + void vec_mul_vec(double* a, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = va * vb; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = a[i] * b[i]; + } + } + + void vec_div_scalar(double* a, double b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = va / b; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = a[i] / b; + } + } + + void vec_div_vec(double* a, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = va / vb; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = a[i] / b[i]; + } + } + + void vec_negate(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = -va; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = -a[i]; + } + } + + void vec_abs(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::abs(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::abs(a[i]); + } + } +} diff --git a/src/lib/cpp/bitwise.cpp b/src/lib/cpp/bitwise.cpp new file mode 100644 index 0000000..6356850 --- /dev/null +++ b/src/lib/cpp/bitwise.cpp @@ -0,0 +1,162 @@ +#include "common.h" + +extern "C" { + void vec_and_vec(double* arr, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = to_int(va) & to_int(vb); + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) & static_cast<int64_t>(b[i])); + } + } + + void vec_and_scalar(double* arr, int b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = to_int(va) & b; + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) & b); + } + }
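+ + // to_int / to_float are conversion helpers from common.h (not shown in this + // hunk); the scalar tails here assume they convert values to and from 64-bit + // integers, which is what the static_cast<int64_t> fallbacks mirror.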
xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) | static_cast<int64_t>(b[i])); + } + } + + void vec_or_scalar(double* arr, int b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = to_int(va) | b; + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) | b); + } + } + + void vec_xor_vec(double* arr, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = to_int(va) ^ to_int(vb); + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) ^ static_cast<int64_t>(b[i])); + } + } + + void vec_xor_scalar(double* arr, int b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = to_int(va) ^ b; + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) ^ b); + } + } + + void vec_not(double* arr, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = ~to_int(va); + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(~static_cast<int64_t>(arr[i])); + } + } + + void vec_lshift_vec(double* arr, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = to_int(va) << to_int(vb); + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) << static_cast<int64_t>(b[i])); + } + } + + void vec_lshift_scalar(double* arr, int b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = to_int(va) << b; + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) << b); + } + } + + void vec_rshift_vec(double* arr, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = to_int(va) >> to_int(vb); + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) >> static_cast<int64_t>(b[i])); + } + } + + void vec_rshift_scalar(double* arr, int b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = to_int(va) >> b; + xsimd::store_unaligned(&arr[i], to_float(res)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = static_cast<double>(static_cast<int64_t>(arr[i]) >> b); + } + } +} diff --git a/src/lib/cpp/common.h b/src/lib/cpp/common.h new file mode 100644 index 0000000..652c981 --- /dev/null +++ b/src/lib/cpp/common.h @@ -0,0 +1,8 
@@ +#pragma once + +#include <xsimd/xsimd.hpp> +#include "lib.h" + +constexpr std::size_t simd_size = xsimd::simd_type<double>::size; +#define MAKE_TRUE() xsimd::batch<double>(1.0) +#define MAKE_FALSE() xsimd::batch<double>(0.0) diff --git a/src/lib/cpp/compare.cpp b/src/lib/cpp/compare.cpp new file mode 100644 index 0000000..91e932c --- /dev/null +++ b/src/lib/cpp/compare.cpp @@ -0,0 +1,135 @@ +#include "common.h" + +extern "C" { + void vec_eq_vec(double* arr, double* b, int n) { + std::size_t size = n - n % simd_size; + auto TRUE = MAKE_TRUE(); + auto FALSE = MAKE_FALSE(); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = va == vb; + xsimd::store_unaligned(&arr[i], select(res, TRUE, FALSE)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = arr[i] == b[i] ? 1.0 : 0.0; + } + } + + void vec_eq_scalar(double* arr, double b, int n) { + std::size_t size = n - n % simd_size; + auto TRUE = MAKE_TRUE(); + auto FALSE = MAKE_FALSE(); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = va == b; + xsimd::store_unaligned(&arr[i], select(res, TRUE, FALSE)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = arr[i] == b ? 1.0 : 0.0; + } + } + + void vec_neq_vec(double* arr, double* b, int n) { + std::size_t size = n - n % simd_size; + auto TRUE = MAKE_TRUE(); + auto FALSE = MAKE_FALSE(); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = va != vb; + xsimd::store_unaligned(&arr[i], select(res, TRUE, FALSE)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = arr[i] != b[i] ? 1.0 : 0.0; + } + } + + void vec_neq_scalar(double* arr, double b, int n) { + std::size_t size = n - n % simd_size; + auto TRUE = MAKE_TRUE(); + auto FALSE = MAKE_FALSE(); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = va != b; + xsimd::store_unaligned(&arr[i], select(res, TRUE, FALSE)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = arr[i] != b ? 1.0 : 0.0; + } + } + + void vec_lt_vec(double* arr, double* b, int n) { + std::size_t size = n - n % simd_size; + auto TRUE = MAKE_TRUE(); + auto FALSE = MAKE_FALSE(); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = va < vb; + xsimd::store_unaligned(&arr[i], select(res, TRUE, FALSE)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = arr[i] < b[i] ? 1.0 : 0.0; + } + } + + void vec_lt_scalar(double* arr, double b, int n) { + std::size_t size = n - n % simd_size; + auto TRUE = MAKE_TRUE(); + auto FALSE = MAKE_FALSE(); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = va < b; + xsimd::store_unaligned(&arr[i], select(res, TRUE, FALSE)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = arr[i] < b ? 1.0 : 0.0; + } + } + + void vec_gt_vec(double* arr, double* b, int n) { + std::size_t size = n - n % simd_size; + auto TRUE = MAKE_TRUE(); + auto FALSE = MAKE_FALSE(); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = va > vb; + xsimd::store_unaligned(&arr[i], select(res, TRUE, FALSE)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = arr[i] > b[i] ? 1.0 : 0.0; + } + } + 
void vec_gt_scalar(double* arr, double b, int n) { + std::size_t size = n - n % simd_size; + auto TRUE = MAKE_TRUE(); + auto FALSE = MAKE_FALSE(); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&arr[i]); + auto res = va > b; + xsimd::store_unaligned(&arr[i], select(res, TRUE, FALSE)); + } + + for (std::size_t i = size; i < n; ++i) { + arr[i] = arr[i] > b ? 1.0 : 0.0; + } + } +} diff --git a/src/lib/cpp/math.cpp b/src/lib/cpp/math.cpp new file mode 100644 index 0000000..238ced3 --- /dev/null +++ b/src/lib/cpp/math.cpp @@ -0,0 +1,147 @@ +#include "common.h" + +extern "C" { + void vec_sqrt(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::sqrt(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::sqrt(a[i]); + } + } + + void vec_pow(double* a, double b, int n) { + std::size_t size = n - n % simd_size; + auto vb = xsimd::batch<double>(b); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::pow(va, vb); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::pow(a[i], b); + } + } + + void vec_ipow(double* a, double b, int n) { + std::size_t size = n - n % simd_size; + auto vb = xsimd::batch<double>(b); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::pow(vb, va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::pow(b, a[i]); // b ** a[i], matching the vectorized operand order above + } + } + + void vec_log(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::log(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::log(a[i]); + } + } + + void vec_logbase(double* a, double b, int n) { + std::size_t size = n - n % simd_size; + auto lb = std::log(b); + auto vb = xsimd::batch<double>(lb); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::log(va) / vb; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::log(a[i]) / lb; + } + } + + void vec_exp(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::exp(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::exp(a[i]); + } + } + + void vec_expm1(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::expm1(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::expm1(a[i]); + } + } + + void vec_log1p(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::log1p(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::log1p(a[i]); + } + } + + void vec_log2(double* a, int n) { + std::size_t size = n - n % simd_size; + auto vb = xsimd::batch<double>(std::log(2.0)); + + 
for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::log(va) / vb; + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::log2(a[i]); + } + } + + void vec_log10(double* a, int n) { + std::size_t size = n - n % simd_size; + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::log10(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::log10(a[i]); + } + } +} diff --git a/src/lib/cpp/misc.cpp b/src/lib/cpp/misc.cpp new file mode 100644 index 0000000..9a5791b --- /dev/null +++ b/src/lib/cpp/misc.cpp @@ -0,0 +1,20 @@ +#include "common.h" + +extern "C" { + void vec_copy(double* a, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto vb = xsimd::load_unaligned(&b[i]); + xsimd::store_unaligned(&a[i], vb); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = b[i]; + } + } + + int get_simd_size() { + return simd_size; + } +} diff --git a/src/lib/cpp/procedure.cpp b/src/lib/cpp/procedure.cpp new file mode 100644 index 0000000..c9c22c8 --- /dev/null +++ b/src/lib/cpp/procedure.cpp @@ -0,0 +1,124 @@ +#include "common.h" + +extern "C" { + double vec_sum(double* a, int n) { + auto sum = xsimd::batch<double>(0.0); + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + sum += va; + } + + double result = xsimd::reduce_add(sum); + + for (std::size_t i = size; i < n; ++i) { + result += a[i]; + } + + return result; + } + + double vec_min(double* a, int n) { + auto min = xsimd::batch<double>(std::numeric_limits<double>::max()); + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + min = xsimd::min(min, va); + } + + double result = xsimd::reduce_min(min); + + for (std::size_t i = size; i < n; ++i) { + result = std::min(result, a[i]); + } + + return result; + } + + double vec_max(double* a, int n) { + auto max = xsimd::batch<double>(std::numeric_limits<double>::lowest()); + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + max = xsimd::max(max, va); + } + + double result = xsimd::reduce_max(max); + + for (std::size_t i = size; i < n; ++i) { + result = std::max(result, a[i]); + } + + return result; + } + + double vec_prod(double* a, int n) { + auto prod = xsimd::batch<double>(1.0); + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + prod *= va; + } + + double result = 1.0; + + // FIXME: Figure out how to do this with xsimd::reduce + for (std::size_t i = 0; i < simd_size; ++i) { + result *= prod.get(i); + } + + for (std::size_t i = size; i < n; ++i) { + result *= a[i]; + } + + return result; + } + + double vec_mean(double* a, int n) { + return vec_sum(a, n) / n; + } + + double vec_var(double* a, int n) { + double mean = vec_mean(a, n); + auto sum = xsimd::batch<double>(0.0); + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + sum += (va - mean) * (va - mean); + } + + double result = xsimd::reduce_add(sum); + + for (std::size_t i = size; i < n; ++i) { + result += (a[i] - mean) * (a[i] - mean); + } + + return result / n; + } + 
double vec_std(double* a, int n) { + return std::sqrt(vec_var(a, n)); + } + + void vec_coerce(double* a, int n, double min, double max) { + auto min_batch = xsimd::batch<double>(min); + auto max_batch = xsimd::batch<double>(max); + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + va = xsimd::min(va, max_batch); + va = xsimd::max(va, min_batch); + xsimd::store_unaligned(&a[i], va); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::min(std::max(a[i], min), max); + } + } +} diff --git a/src/lib/cpp/trigonometry.cpp b/src/lib/cpp/trigonometry.cpp new file mode 100644 index 0000000..76ca277 --- /dev/null +++ b/src/lib/cpp/trigonometry.cpp @@ -0,0 +1,201 @@ +#include "common.h" + +extern "C" { + void vec_sin(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::sin(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::sin(a[i]); + } + } + + void vec_cos(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::cos(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::cos(a[i]); + } + } + + void vec_tan(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::tan(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::tan(a[i]); + } + } + + void vec_asin(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::asin(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::asin(a[i]); + } + } + + void vec_acos(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::acos(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::acos(a[i]); + } + } + + void vec_atan(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::atan(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::atan(a[i]); + } + } + + void vec_atan2(double* a, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = xsimd::atan2(va, vb); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::atan2(a[i], b[i]); + } + } + + void vec_sinh(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::sinh(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::sinh(a[i]); + } + } + + void vec_cosh(double* a, int n) { + std::size_t size = n - n % simd_size; + + 
for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::cosh(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::cosh(a[i]); + } + } + + void vec_tanh(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::tanh(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::tanh(a[i]); + } + } + + void vec_asinh(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::asinh(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::asinh(a[i]); + } + } + + void vec_acosh(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::acosh(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::acosh(a[i]); + } + } + + void vec_atanh(double* a, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto res = xsimd::atanh(va); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::atanh(a[i]); + } + } + + void vec_hypot(double* a, double* b, int n) { + std::size_t size = n - n % simd_size; + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto vb = xsimd::load_unaligned(&b[i]); + auto res = xsimd::hypot(va, vb); + xsimd::store_unaligned(&a[i], res); + } + + for (std::size_t i = size; i < n; ++i) { + a[i] = std::hypot(a[i], b[i]); + } + } +} diff --git a/src/lib/cpp/vector.cpp b/src/lib/cpp/vector.cpp new file mode 100644 index 0000000..6324257 --- /dev/null +++ b/src/lib/cpp/vector.cpp @@ -0,0 +1,43 @@ +#include "common.h" + +extern "C" { + double vec_dot(double* a, double* b, int n) { + std::size_t size = n - n % simd_size; + auto res = xsimd::batch<double>(0.0); + + for (std::size_t i = 0; i < size; i += simd_size) { + auto va = xsimd::load_unaligned(&a[i]); + auto vb = xsimd::load_unaligned(&b[i]); + res += va * vb; + } + + auto result = xsimd::reduce_add(res); + + for (std::size_t i = size; i < n; ++i) { + result += a[i] * b[i]; + } + + return result; + } + + void vec_matmul(double* a, double* b, double* c, int n, int m, int p) { + std::size_t size = p - p % simd_size; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < p; ++j) { + c[i * p + j] = 0.0; + } + for (int k = 0; k < m; ++k) { + // Broadcast a[i][k] and stream along row k of b so every load is unit-stride; + // a contiguous load at &b[k * p + j] reads row elements, so accumulating it + // against a row of a (as a dot product) would multiply the wrong operands. + auto va = xsimd::batch<double>(a[i * m + k]); + std::size_t j = 0; + for (; j < size; j += simd_size) { + auto vb = xsimd::load_unaligned(&b[k * p + j]); + auto vc = xsimd::load_unaligned(&c[i * p + j]); + xsimd::store_unaligned(&c[i * p + j], vc + va * vb); + } + for (; j < p; ++j) { + c[i * p + j] += a[i * m + k] * b[k * p + j]; + } + } + } + } +} diff --git a/src/lib/public/arithmetic.h b/src/lib/public/arithmetic.h new file mode 100644 index 0000000..86721dc --- /dev/null +++ b/src/lib/public/arithmetic.h @@ -0,0 +1,12 @@ +#pragma once + +void vec_add_vec(double* arr, double* b, int n); +void vec_add_scalar(double* arr, double b, int n); +void vec_sub_vec(double* arr, double* b, int n); +void vec_sub_scalar(double* arr, double b, int n); +void vec_mul_vec(double* 
arr, double* b, int n); +void vec_mul_scalar(double* arr, double b, int n); +void vec_div_vec(double* arr, double* b, int n); +void vec_div_scalar(double* arr, double b, int n); +void vec_negate(double* arr, int n); +void vec_abs(double* arr, int n); diff --git a/src/lib/public/bitwise.h b/src/lib/public/bitwise.h new file mode 100644 index 0000000..5eebe96 --- /dev/null +++ b/src/lib/public/bitwise.h @@ -0,0 +1,13 @@ +#pragma once + +void vec_and_vec(double* arr, double* b, int n); +void vec_and_scalar(double* arr, int b, int n); +void vec_or_vec(double* arr, double* b, int n); +void vec_or_scalar(double* arr, int b, int n); +void vec_xor_vec(double* arr, double* b, int n); +void vec_xor_scalar(double* arr, int b, int n); +void vec_not(double* arr, int n); +void vec_lshift_vec(double* arr, double* b, int n); +void vec_lshift_scalar(double* arr, int b, int n); +void vec_rshift_vec(double* arr, double* b, int n); +void vec_rshift_scalar(double* arr, int b, int n); diff --git a/src/lib/public/compare.h b/src/lib/public/compare.h new file mode 100644 index 0000000..82ead3a --- /dev/null +++ b/src/lib/public/compare.h @@ -0,0 +1,10 @@ +#pragma once + +void vec_eq_vec(double* arr, double* b, int n); +void vec_eq_scalar(double* arr, double b, int n); +void vec_neq_vec(double* arr, double* b, int n); +void vec_neq_scalar(double* arr, double b, int n); +void vec_lt_vec(double* arr, double* b, int n); +void vec_lt_scalar(double* arr, double b, int n); +void vec_gt_vec(double* arr, double* b, int n); +void vec_gt_scalar(double* arr, double b, int n); diff --git a/src/lib/public/lib.h b/src/lib/public/lib.h new file mode 100644 index 0000000..ac5ad0d --- /dev/null +++ b/src/lib/public/lib.h @@ -0,0 +1,18 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include "arithmetic.h" +#include "bitwise.h" +#include "compare.h" +#include "math.h" +#include "misc.h" +#include "procedure.h" +#include "trigonometry.h" +#include "vector.h" + +#ifdef __cplusplus +} +#endif diff --git a/src/lib/public/math.h b/src/lib/public/math.h new file mode 100644 index 0000000..3cecc55 --- /dev/null +++ b/src/lib/public/math.h @@ -0,0 +1,13 @@ +#pragma once + +void vec_sqrt(double* arr, int n); +void vec_pow(double* arr, double b, int n); +void vec_ipow(double* arr, double b, int n); +void vec_log(double* arr, int n); +void vec_logbase(double* arr, double b, int n); +void vec_exp(double* arr, int n); + +void vec_expm1(double* arr, int n); +void vec_log1p(double* arr, int n); +void vec_log2(double* arr, int n); +void vec_log10(double* arr, int n); diff --git a/src/lib/public/misc.h b/src/lib/public/misc.h new file mode 100644 index 0000000..2576b21 --- /dev/null +++ b/src/lib/public/misc.h @@ -0,0 +1,4 @@ +#pragma once + +void vec_copy(double* arr, double* b, int n); +int get_simd_size(); diff --git a/src/lib/public/procedure.h b/src/lib/public/procedure.h new file mode 100644 index 0000000..31f3a27 --- /dev/null +++ b/src/lib/public/procedure.h @@ -0,0 +1,12 @@ +#pragma once + +double vec_sum(double* arr, int n); +double vec_min(double* arr, int n); +double vec_max(double* arr, int n); +double vec_prod(double* arr, int n); + +double vec_mean(double* arr, int n); +double vec_var(double* arr, int n); +double vec_std(double* arr, int n); + +void vec_coerce(double* arr, int n, double min, double max); diff --git a/src/lib/public/trigonometry.h b/src/lib/public/trigonometry.h new file mode 100644 index 0000000..6c4132d --- /dev/null +++ b/src/lib/public/trigonometry.h @@ -0,0 +1,16 @@ +#pragma once + +void 
vec_sin(double* arr, int n); +void vec_cos(double* arr, int n); +void vec_tan(double* arr, int n); +void vec_asin(double* arr, int n); +void vec_acos(double* arr, int n); +void vec_atan(double* arr, int n); +void vec_atan2(double* arr, double* b, int n); +void vec_sinh(double* arr, int n); +void vec_cosh(double* arr, int n); +void vec_tanh(double* arr, int n); +void vec_asinh(double* arr, int n); +void vec_acosh(double* arr, int n); +void vec_atanh(double* arr, int n); +void vec_hypot(double* arr, double* b, int n); diff --git a/src/lib/public/vector.h b/src/lib/public/vector.h new file mode 100644 index 0000000..3c68c4b --- /dev/null +++ b/src/lib/public/vector.h @@ -0,0 +1,4 @@ +#pragma once + +double vec_dot(double* a, double* b, int n); +void vec_matmul(double* a, double* b, double* c, int n, int m, int p); diff --git a/src/nativeMain/cinterops/jni.def b/src/nativeMain/cinterops/jni.def new file mode 100644 index 0000000..8e21ac8 --- /dev/null +++ b/src/nativeMain/cinterops/jni.def @@ -0,0 +1,2 @@ +headers = jni.h +package = jni diff --git a/src/nativeMain/cinterops/simd.def b/src/nativeMain/cinterops/simd.def new file mode 100644 index 0000000..0cb3b1c --- /dev/null +++ b/src/nativeMain/cinterops/simd.def @@ -0,0 +1,4 @@ +package = simd +headers = public/lib.h + +linkerOpts.linux_x64 = -Lbuild/cmake/simd/linuxX64 -lsimd diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/NativeSpeedup.native.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/NativeSpeedup.native.kt new file mode 100644 index 0000000..b223bd3 --- /dev/null +++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/NativeSpeedup.native.kt @@ -0,0 +1,434 @@ +package com.martmists.ndarray.simd + +import kotlinx.cinterop.addressOf +import kotlinx.cinterop.usePinned + +internal actual object NativeSpeedup { + actual fun vecAddVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_add_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecAddScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_add_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecSubVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_sub_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecSubScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_sub_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecMulVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_mul_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecMulScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_mul_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecDivVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_div_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecDivScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_div_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecNegate(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + 
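/* usePinned pins the DoubleArray for the duration of the lambda so the Kotlin/Native GC cannot move
   it while C code writes through the raw pointer, and addressOf(offset) then yields a pointer into
   the pinned storage. A minimal sketch of the same pattern as a standalone function (negateInPlace
   is hypothetical; simd.vec_negate is the cinterop binding used throughout this file):

   fun negateInPlace(values: DoubleArray) {
       values.usePinned { pinned ->
           // the pointer is only valid while the array stays pinned, i.e. inside this block
           simd.vec_negate(pinned.addressOf(0), values.size)
       }
   }
*/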
simd.vec_negate(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecAbs(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_abs(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecAndVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_and_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecAndScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) { + a.usePinned { pinA -> + simd.vec_and_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecOrVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_or_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecOrScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) { + a.usePinned { pinA -> + simd.vec_or_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecXorVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_xor_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecXorScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) { + a.usePinned { pinA -> + simd.vec_xor_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecNot(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_not(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecLShiftVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_lshift_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecLShiftScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) { + a.usePinned { pinA -> + simd.vec_lshift_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecRShiftVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_rshift_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecRShiftScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Int) { + a.usePinned { pinA -> + simd.vec_rshift_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecEqVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_eq_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecEqScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_eq_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecNeqVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_neq_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecNeqScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_neq_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecLtVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_lt_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecLtScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + 
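/* The comparison entry points overwrite the input with 1.0/0.0 masks instead of returning booleans,
   which keeps every result in DoubleArray form and composable with the other kernels. A reference
   sketch of what such a mask means (select is a hypothetical helper, not part of this API):

   // out[i] = a[i] where mask[i] == 1.0, else b[i]
   fun select(mask: DoubleArray, a: DoubleArray, b: DoubleArray): DoubleArray =
       DoubleArray(mask.size) { i -> if (mask[i] == 1.0) a[i] else b[i] }
*/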
simd.vec_lt_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecGtVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_gt_vec(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecGtScalar(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_gt_scalar(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecSqrt(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_sqrt(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecPow(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_pow(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun veciPow(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_ipow(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecLog(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_log(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecLogBase(a: DoubleArray, aOffset: Int, aSize: Int, b: Double) { + a.usePinned { pinA -> + simd.vec_logbase(pinA.addressOf(aOffset), b, aSize) + } + } + + actual fun vecExp(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_exp(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecExpm1(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_expm1(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecLog1p(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_log1p(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecLog2(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_log2(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecLog10(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_log10(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecCopy(dest: DoubleArray, destOffset: Int, destSize: Int, src: DoubleArray, srcOffset: Int) { + dest.usePinned { pinDest -> + src.usePinned { pinSrc -> + simd.vec_copy(pinDest.addressOf(destOffset), pinSrc.addressOf(srcOffset), destSize) + } + } + } + + actual fun getSimdSize(): Int { + return simd.get_simd_size() + } + + actual fun vecSum(a: DoubleArray, aOffset: Int, aSize: Int): Double { + a.usePinned { pinA -> + return simd.vec_sum(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecMin(a: DoubleArray, aOffset: Int, aSize: Int): Double { + a.usePinned { pinA -> + return simd.vec_min(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecMax(a: DoubleArray, aOffset: Int, aSize: Int): Double { + a.usePinned { pinA -> + return simd.vec_max(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecProduct(a: DoubleArray, aOffset: Int, aSize: Int): Double { + a.usePinned { pinA -> + return simd.vec_prod(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecMean(a: DoubleArray, aOffset: Int, aSize: Int): Double { + a.usePinned { pinA -> + return simd.vec_mean(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecVariance(a: DoubleArray, aOffset: Int, aSize: Int): Double { + a.usePinned { pinA -> + return simd.vec_var(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecStdDev(a: DoubleArray, aOffset: Int, aSize: Int): Double { + a.usePinned { pinA -> + return simd.vec_std(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecCoerce(a: DoubleArray, aOffset: Int, aSize: Int, min: Double, max: Double) { + 
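/* vec_coerce clamps every element into [min, max], i.e. the same semantics as Kotlin's
   Double.coerceIn applied elementwise. Scalar reference behaviour for comparison (sketch):

   fun coerceReference(a: DoubleArray, min: Double, max: Double) {
       for (i in a.indices) a[i] = a[i].coerceIn(min, max)
   }
*/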
a.usePinned { pinA -> + simd.vec_coerce(pinA.addressOf(aOffset), aSize, min, max) + } + } + + actual fun vecSin(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_sin(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecCos(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_cos(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecTan(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_tan(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecAsin(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_asin(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecAcos(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_acos(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecAtan(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_atan(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecAtan2(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_atan2(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecSinh(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_sinh(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecCosh(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_cosh(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecTanh(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_tanh(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecAsinh(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_asinh(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecAcosh(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_acosh(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecAtanh(a: DoubleArray, aOffset: Int, aSize: Int) { + a.usePinned { pinA -> + simd.vec_atanh(pinA.addressOf(aOffset), aSize) + } + } + + actual fun vecHypot(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int) { + a.usePinned { pinA -> + b.usePinned { pinB -> + simd.vec_hypot(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecDot(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int): Double { + a.usePinned { pinA -> + b.usePinned { pinB -> + return simd.vec_dot(pinA.addressOf(aOffset), pinB.addressOf(bOffset), aSize) + } + } + } + + actual fun vecMatMul(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int, n: Int, m: Int, p: Int): DoubleArray { + val c = DoubleArray(n * p) // an (n x m) * (m x p) product is an n x p matrix + a.usePinned { pinA -> + b.usePinned { pinB -> + c.usePinned { pinC -> + simd.vec_matmul(pinA.addressOf(aOffset), pinB.addressOf(bOffset), pinC.addressOf(0), n, m, p) + } + } + } + return c + } +} diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/arithmetic.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/arithmetic.kt new file mode 100644 index 0000000..25c1455 --- /dev/null +++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/arithmetic.kt @@ -0,0 +1,139 @@ +package com.martmists.ndarray.simd.jni + +import jni.* +import kotlinx.cinterop.* +import simd.* + 
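/* Each bridge function below is exported with @CName set to the JNI-mangled symbol
   Java_<package, dots as underscores>_<class>_<method>, so the JVM can bind the external
   declarations on NativeSpeedup without RegisterNatives. The bodies all follow one pattern:
   GetPrimitiveArrayCritical exposes the array storage (usually without a copy), the element
   pointer is offset manually, and ReleasePrimitiveArrayCritical publishes the writes back.
   Sketch of the JVM-side declaration this mangling assumes (hypothetical, jvmMain not shown here):

   // class com.martmists.ndarray.simd.NativeSpeedup (JVM side)
   // external fun vecAddVec(a: DoubleArray, aOffset: Int, aSize: Int, b: DoubleArray, bOffset: Int)
*/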
@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAddVec") +fun jni_vec_add_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_add_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAddScalar") +fun jni_vec_add_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_add_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecSubVec") +fun jni_vec_sub_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_sub_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecSubScalar") +fun jni_vec_sub_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_sub_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecMulVec") +fun jni_vec_mul_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_mul_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecMulScalar") +fun jni_vec_mul_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_mul_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + 
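/* Note the offset arithmetic above is done on raw pointers in bytes, not elements: aOffset is
   scaled by sizeOf<DoubleVar>() (8 bytes) before being added to rawValue. A worked instance of
   the scaling, as a sketch: for aOffset = 3 the element pointer sits 24 bytes past the base:

   // val refA = interpretCPointer<DoubleVar>(arrA.rawValue + 3 * sizeOf<DoubleVar>())  // base + 24
*/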
@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecDivVec") +fun jni_vec_div_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_div_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecDivScalar") +fun jni_vec_div_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_div_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecNegate") +fun jni_vec_negate(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_negate(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAbs") +fun jni_vec_abs(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_abs(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/bitwise.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/bitwise.kt new file mode 100644 index 0000000..dce8d3b --- /dev/null +++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/bitwise.kt @@ -0,0 +1,152 @@ +package com.martmists.ndarray.simd.jni + +import jni.* +import kotlinx.cinterop.* +import simd.* + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAndVec") +fun jni_vec_and_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_and_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + 
@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAndScalar") +fun jni_vec_and_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_and_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecOrVec") +fun jni_vec_or_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_or_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecOrScalar") +fun jni_vec_or_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_or_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecXorVec") +fun jni_vec_xor_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_xor_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecXorScalar") +fun jni_vec_xor_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_xor_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecNot") +fun jni_vec_not(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_not(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + 
@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLShiftVec") +fun jni_vec_lshift_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_lshift_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLShiftScalar") +fun jni_vec_lshift_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_lshift_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecRShiftVec") +fun jni_vec_rshift_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_rshift_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecRShiftScalar") +fun jni_vec_rshift_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_rshift_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/compare.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/compare.kt new file mode 100644 index 0000000..c7acd58 --- /dev/null +++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/compare.kt @@ -0,0 +1,113 @@ +package com.martmists.ndarray.simd.jni + +import jni.* +import kotlinx.cinterop.* +import simd.* + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecEqVec") +fun jni_vec_eq_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_eq_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + 
@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecEqScalar") +fun jni_vec_eq_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_eq_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecNeqVec") +fun jni_vec_neq_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_neq_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecNeqScalar") +fun jni_vec_neq_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_neq_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecGtVec") +fun jni_vec_gt_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_gt_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecGtScalar") +fun jni_vec_gt_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_gt_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLtVec") +fun jni_vec_lt_vec(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>() + val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>()) + + vec_lt_vec(refA, refB, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0) + } +} + 
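/* Between GetPrimitiveArrayCritical and ReleasePrimitiveArrayCritical the thread must not call
   other JNI functions or block, which is why each bridge does nothing but the single vec_* call.
   Release mode 0 means "copy back if the VM handed out a copy, then free it"; a read-only caller
   could pass JNI_ABORT to skip the copy-back. Sketch of that variant (not used by this library,
   since every kernel here mutates the array in place):

   // env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, JNI_ABORT)
*/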
@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLtScalar") +fun jni_vec_lt_scalar(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_lt_scalar(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/math.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/math.kt new file mode 100644 index 0000000..e672b3d --- /dev/null +++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/math.kt @@ -0,0 +1,125 @@ +package com.martmists.ndarray.simd.jni + +import jni.* +import kotlinx.cinterop.* +import simd.* + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecSqrt") +fun jni_vec_sqrt(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_sqrt(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecPow") +fun jni_vec_pow(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_pow(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_veciPow") +fun jni_veci_pow(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_ipow(refA, b, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecExp") +fun jni_vec_exp(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_exp(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLog") +fun jni_vec_log(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>() + val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>()) + + vec_log(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + 
diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/math.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/math.kt
new file mode 100644
index 0000000..e672b3d
--- /dev/null
+++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/math.kt
@@ -0,0 +1,125 @@
+package com.martmists.ndarray.simd.jni
+
+import jni.*
+import kotlinx.cinterop.*
+import simd.*
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecSqrt")
+fun jni_vec_sqrt(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_sqrt(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecPow")
+fun jni_vec_pow(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_pow(refA, b, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_veciPow")
+fun jni_veci_pow(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_ipow(refA, b, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecExp")
+fun jni_vec_exp(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_exp(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLog")
+fun jni_vec_log(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_log(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLogBase")
+fun jni_vec_log_base(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdouble) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_logbase(refA, b, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecExpm1")
+fun jni_vec_expm1(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_expm1(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLog1p")
+fun jni_vec_log1p(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_log1p(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLog2")
+fun jni_vec_log2(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_log2(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecLog10")
+fun jni_vec_log10(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_log10(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
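All of these wrappers compute the element pointer by hand from rawValue plus a byte offset. Since arrA is a CPointer<DoubleVar>, kotlinx.cinterop's typed pointer arithmetic should express the same thing more directly; a sketch:

    // Equivalent offset computation using typed pointer arithmetic (sketch):
    val refA: CPointer<DoubleVar>? = arrA + aOffset.toLong()  // plus() scales by sizeOf<DoubleVar>()

Both forms yield the address of a[aOffset]; the rawValue form merely spells out the byte arithmetic explicitly.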
diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/misc.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/misc.kt
new file mode 100644
index 0000000..3f98b26
--- /dev/null
+++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/misc.kt
@@ -0,0 +1,28 @@
+package com.martmists.ndarray.simd.jni
+
+import jni.JNIEnvVar
+import jni.jdoubleArray
+import jni.jint
+import jni.jobject
+import kotlinx.cinterop.*
+import simd.*
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecCopy")
+fun jni_vec_copy(env: CPointer<JNIEnvVar>, thisObject: jobject, dest: jdoubleArray, destOffset: jint, destSize: jint, src: jdoubleArray, srcOffset: jint) {
+    memScoped {
+        val arrSrc = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, src, null)!!.reinterpret<DoubleVar>()
+        val refSrc = interpretCPointer<DoubleVar>(arrSrc.rawValue + srcOffset * sizeOf<DoubleVar>())
+        val arrDest = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, dest, null)!!.reinterpret<DoubleVar>()
+        val refDest = interpretCPointer<DoubleVar>(arrDest.rawValue + destOffset * sizeOf<DoubleVar>())
+
+        vec_copy(refSrc, refDest, destSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, src, arrSrc, 0)
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, dest, arrDest, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_getSimdSize")
+fun jni_get_simd_size(env: CPointer<JNIEnvVar>, thisObject: jobject): jint {
+    return get_simd_size().convert()
+}
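getSimdSize exposes the native vector width so the JVM side can decide when a JNI round-trip is worth it. A hypothetical caller (threshold and names assumed, not part of this diff):

    // Only cross into native code for arrays spanning several SIMD registers.
    val lanes = NativeSpeedup.getSimdSize()
    val useNative = size >= lanes * 4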
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecStd") +fun jni_vec_std(env: CPointer, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint): Double { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret() + val refA = interpretCPointer(arrA.rawValue + aOffset * sizeOf()) + + val result = vec_std(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + + return result + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecCoerce") +fun jni_vec_coerce(env: CPointer, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, min: jdouble, max: jdouble) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret() + val refA = interpretCPointer(arrA.rawValue + aOffset * sizeOf()) + + val result = vec_coerce(refA, aSize, min, max) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + + return result + } +} diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/trigonometry.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/trigonometry.kt new file mode 100644 index 0000000..0c3b2b4 --- /dev/null +++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/trigonometry.kt @@ -0,0 +1,179 @@ +package com.martmists.ndarray.simd.jni + +import jni.* +import kotlinx.cinterop.* +import simd.* + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecSin") +fun jni_vec_sin(env: CPointer, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret() + val refA = interpretCPointer(arrA.rawValue + aOffset * sizeOf()) + + vec_sin(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecCos") +fun jni_vec_cos(env: CPointer, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret() + val refA = interpretCPointer(arrA.rawValue + aOffset * sizeOf()) + + vec_cos(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecTan") +fun jni_vec_tan(env: CPointer, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret() + val refA = interpretCPointer(arrA.rawValue + aOffset * sizeOf()) + + vec_tan(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAsin") +fun jni_vec_asin(env: CPointer, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret() + val refA = interpretCPointer(arrA.rawValue + aOffset * sizeOf()) + + vec_asin(refA, aSize) + + env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0) + } +} + +@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAcos") +fun jni_vec_acos(env: CPointer, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) { + memScoped { + val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, 
diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/trigonometry.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/trigonometry.kt
new file mode 100644
index 0000000..0c3b2b4
--- /dev/null
+++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/trigonometry.kt
@@ -0,0 +1,179 @@
+package com.martmists.ndarray.simd.jni
+
+import jni.*
+import kotlinx.cinterop.*
+import simd.*
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecSin")
+fun jni_vec_sin(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_sin(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecCos")
+fun jni_vec_cos(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_cos(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecTan")
+fun jni_vec_tan(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_tan(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAsin")
+fun jni_vec_asin(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_asin(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAcos")
+fun jni_vec_acos(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_acos(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAtan")
+fun jni_vec_atan(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_atan(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAtan2")
+fun jni_vec_atan2(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+        val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>()
+        val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>())
+
+        vec_atan2(refA, refB, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecSinh")
+fun jni_vec_sinh(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_sinh(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecCosh")
+fun jni_vec_cosh(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_cosh(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecTanh")
+fun jni_vec_tanh(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_tanh(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAsinh")
+fun jni_vec_asinh(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_asinh(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAcosh")
+fun jni_vec_acosh(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_acosh(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecAtanh")
+fun jni_vec_atanh(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+
+        vec_atanh(refA, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecHypot")
+fun jni_vec_hypot(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint) {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+        val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>()
+        val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>())
+
+        vec_hypot(refA, refB, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0)
+    }
+}
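vecAtan2 and vecHypot consume two arrays but take only aSize; nothing on the native side checks bounds, so the caller must guarantee that b holds at least aSize elements past bOffset. A hypothetical JVM-side call (names assumed):

    // y is overwritten in place with atan2(y[i], x[i]) for i in 0 until n.
    NativeSpeedup.vecAtan2(y, 0, n, x, 0)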
diff --git a/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/vector.kt b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/vector.kt
new file mode 100644
index 0000000..aa2be26
--- /dev/null
+++ b/src/nativeMain/kotlin/com/martmists/ndarray/simd/jni/vector.kt
@@ -0,0 +1,43 @@
+package com.martmists.ndarray.simd.jni
+
+import jni.*
+import kotlinx.cinterop.*
+import simd.*
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecDot")
+fun jni_vec_dot(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint): jdouble {
+    memScoped {
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+        val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>()
+        val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>())
+
+        val res = vec_dot(refA, refB, aSize)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0)
+
+        return res
+    }
+}
+
+@CName("Java_com_martmists_ndarray_simd_NativeSpeedup_vecMatMul")
+fun jni_vec_mat_mul(env: CPointer<JNIEnvVar>, thisObject: jobject, a: jdoubleArray, aOffset: jint, aSize: jint, b: jdoubleArray, bOffset: jint, m: jint, n: jint, p: jint): jdoubleArray? {
+    memScoped {
+        // Allocate the result before pinning: JNI calls must not happen inside a critical region.
+        val c = env.pointed.pointed!!.NewDoubleArray!!.invoke(env, m * p)
+        val arrA = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, a, null)!!.reinterpret<DoubleVar>()
+        val refA = interpretCPointer<DoubleVar>(arrA.rawValue + aOffset * sizeOf<DoubleVar>())
+        val arrB = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, b, null)!!.reinterpret<DoubleVar>()
+        val refB = interpretCPointer<DoubleVar>(arrB.rawValue + bOffset * sizeOf<DoubleVar>())
+        val arrC = env.pointed.pointed!!.GetPrimitiveArrayCritical!!.invoke(env, c, null)!!.reinterpret<DoubleVar>()
+
+        vec_matmul(refA, refB, arrC, m, n, p)
+
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, a, arrA, 0)
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, b, arrB, 0)
+        env.pointed.pointed!!.ReleasePrimitiveArrayCritical!!.invoke(env, c, arrC, 0)
+
+        return c
+    }
+}
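vecMatMul is the one entry point that allocates its own result: an m*p array created with NewDoubleArray before the inputs are pinned. A hypothetical JVM-side call, assuming row-major flattening as the symbol's parameter order suggests:

    // (m x n) * (n x p) -> (m x p), all matrices flattened row-major.
    val c: DoubleArray = NativeSpeedup.vecMatMul(a, 0, a.size, b, 0, m, n, p)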