diff --git a/Cargo.lock b/Cargo.lock index 14cd22e5..f61a56a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned-vec" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -97,14 +106,43 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + +[[package]] +name = "arbitrary" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" + [[package]] name = "arg" version = "0.0.0" -source = "git+https://github.com/YdrMaster/InfiniNN?rev=e3061d6#e3061d6078f2fa3a6105ea7815ba6bb96b52495e" +source = "git+https://github.com/CearX/InfiniNN.git?rev=3ba7418#3ba74181913b3705025dad6c8460bc1b899b7243" dependencies = [ "symbolic-expr", ] +[[package]] +name = "arg_enum_proc_macro" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "atomic-waker" version = "1.1.2" @@ -117,6 +155,29 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "av1-grain" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f3efb2ca85bc610acfa917b5aaa36f3fcbebed5b3182d7f877b02531c4b80c8" +dependencies = [ + "anyhow", + "arrayvec", + "log", + "nom", + "num-rational", + "v_frame", +] + +[[package]] +name = "avif-serialize" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ea8ef51aced2b9191c08197f55450d830876d9933f8f48a429b354f1d496b42" +dependencies = [ + "arrayvec", +] + [[package]] name = "backtrace" version = "0.3.75" @@ -144,7 +205,7 @@ version = "0.71.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" dependencies = [ - "bitflags", + "bitflags 2.9.1", "cexpr", "clang-sys", "itertools 0.13.0", @@ -164,7 +225,7 @@ version = "0.72.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f72209734318d0b619a5e0f5129918b848c416e122a3c4ce054e03cb87b726f" dependencies = [ - "bitflags", + "bitflags 2.9.1", "cexpr", "clang-sys", "itertools 0.13.0", @@ -193,12 +254,30 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bit_field" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "bitstream-io" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2" + [[package]] name = "build-script-cfg" version = "0.0.0" @@ -211,12 +290,50 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c29e9fc2e21f813dfebfc1af5c08b1ebea0a69f79fc6dab7da77c9825efc757" +[[package]] +name = "built" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56ed6191a7e78c36abdb16ab65341eefd73d64d303fffccdbb00d51e4205967b" + [[package]] name = "bumpalo" version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "bytemuck" +version = "1.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c76a5792e44e4abe34d3abf15636779261d45a7450612059293d1d2cfc63422" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "441473f2b4b0459a68628c744bc61d23e730fb00128b841d30fa4bb3972257e4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + [[package]] name = "bytes" version = "1.10.1" @@ -250,6 +367,8 @@ version = "1.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c1599538de2394445747c8cf7935946e3cc27e9625f889d979bfb2aaf569362" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -262,6 +381,16 @@ dependencies = [ "nom", ] +[[package]] +name = "cfg-expr" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02" +dependencies = [ + "smallvec", + "target-lexicon", +] + [[package]] name = "cfg-if" version = "1.0.1" @@ -331,6 +460,12 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + [[package]] name = "colorchoice" version = "1.0.4" @@ -404,6 +539,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -435,7 +579,7 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" dependencies = [ - "bitflags", + "bitflags 2.9.1", "crossterm_winapi", "mio", "parking_lot", @@ -451,7 +595,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" dependencies = [ - "bitflags", + "bitflags 2.9.1", "crossterm_winapi", "derive_more", "document-features", @@ -598,6 +742,15 @@ dependencies = [ "litrs", ] +[[package]] +name = "dyn-stack" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490bd48eb68fffcfed519b4edbfd82c69cbe741d175b84f0e0cbe8c57cbe0bdd" +dependencies = [ + "bytemuck", +] + [[package]] name = "either" version = "1.15.0" @@ -619,6 +772,38 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "enum-as-inner" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -638,13 +823,28 @@ dependencies = [ [[package]] name = "exec" version = "0.0.0" -source = "git+https://github.com/YdrMaster/InfiniNN?rev=e3061d6#e3061d6078f2fa3a6105ea7815ba6bb96b52495e" +source = "git+https://github.com/CearX/InfiniNN.git?rev=3ba7418#3ba74181913b3705025dad6c8460bc1b899b7243" dependencies = [ "arg", "graph", "tensor", ] +[[package]] +name = "exr" +version = "1.73.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83197f59927b46c04a183a619b7c29df34e63e63c7869320862268c0ef687e0" +dependencies = [ + "bit_field", + "half", + "lebe", + "miniz_oxide", + "rayon-core", + "smallvec", + "zune-inflate", +] + [[package]] name = "fancy-regex" version = "0.14.0" @@ -662,6 +862,15 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + [[package]] name = "find_cuda_helper" version = "0.2.0" @@ -671,6 +880,16 @@ dependencies = [ "glob", ] +[[package]] +name = "flate2" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "flexi_logger" version = "0.30.2" @@ -681,7 +900,7 @@ dependencies = [ "log", "nu-ansi-term", "regex", - "thiserror", + "thiserror 2.0.12", ] [[package]] @@ -791,6 +1010,125 @@ dependencies = [ "slab", ] +[[package]] +name = "gemm" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab96b703d31950f1aeddded248bc95543c9efc7ac9c4a21fda8703a83ee35451" +dependencies = [ + "dyn-stack", + "gemm-c32", + "gemm-c64", + "gemm-common", + "gemm-f16", + "gemm-f32", + "gemm-f64", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-c32" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6db9fd9f40421d00eea9dd0770045a5603b8d684654816637732463f4073847" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-c64" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfcad8a3d35a43758330b635d02edad980c1e143dc2f21e6fd25f9e4eada8edf" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-common" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a352d4a69cbe938b9e2a9cb7a3a63b7e72f9349174a2752a558a8a563510d0f3" +dependencies = [ + "bytemuck", + "dyn-stack", + "half", + "libm", + "num-complex", + "num-traits", + "once_cell", + "paste", + "pulp", + "raw-cpuid", + "rayon", + "seq-macro", + "sysctl", +] + +[[package]] +name = "gemm-f16" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff95ae3259432f3c3410eaa919033cd03791d81cebd18018393dc147952e109" +dependencies = [ + "dyn-stack", + "gemm-common", + "gemm-f32", + "half", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "rayon", + "seq-macro", +] + +[[package]] +name = "gemm-f32" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc8d3d4385393304f407392f754cd2dc4b315d05063f62cf09f47b58de276864" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-f64" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35b2a4f76ce4b8b16eadc11ccf2e083252d8237c1b589558a49b0183545015bd" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -836,6 +1174,16 @@ dependencies = [ "regex", ] +[[package]] +name = "gif" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ae047235e33e2829703574b54fdec96bfbad892062d97fed2f76022287de61b" +dependencies = [ + "color_quant", + "weezl", +] + [[package]] name = "gimli" version = "0.31.1" @@ -851,7 +1199,7 @@ checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "graph" version = "0.0.0" -source = "git+https://github.com/YdrMaster/InfiniNN?rev=e3061d6#e3061d6078f2fa3a6105ea7815ba6bb96b52495e" +source = "git+https://github.com/CearX/InfiniNN.git?rev=3ba7418#3ba74181913b3705025dad6c8460bc1b899b7243" [[package]] name = "h2" @@ -878,8 +1226,10 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" dependencies = [ + "bytemuck", "cfg-if", "crunchy", + "num-traits", ] [[package]] @@ -1161,6 +1511,45 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "image" +version = "0.25.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db35664ce6b9810857a38a906215e75a9c879f0696556a39f59c62829710251a" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "exr", + "gif", + "image-webp", + "num-traits", + "png", + "qoi", + "ravif", + "rayon", + "rgb", + "tiff", + "zune-core", + "zune-jpeg", +] + +[[package]] +name = "image-webp" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6970fe7a5300b4b42e62c52efa0187540a5bef546c60edaf554ef595d2e6f0b" +dependencies = [ + "byteorder-lite", + "quick-error", +] + +[[package]] +name = "imgref" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0263a3d970d5c054ed9312c0057b4f3bde9c0b33836d3637361d4a9e6e7a408" + [[package]] name = "indexmap" version = "2.10.0" @@ -1203,13 +1592,24 @@ dependencies = [ "syn", ] +[[package]] +name = "interpolate_name" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "io-uring" version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b86e202f00093dcba4275d4636b93ef9dd75d025ae560d2521b45ea28ab49013" dependencies = [ - "bitflags", + "bitflags 2.9.1", "cfg-if", "libc", ] @@ -1236,6 +1636,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.13.0" @@ -1260,6 +1669,22 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jobserver" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" +dependencies = [ + "getrandom 0.3.3", + "libc", +] + +[[package]] +name = "jpeg-decoder" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00810f1d8b74be64b13dbf3db89ac67740615d6c891f0e7b6179326533011a07" + [[package]] name = "js-sys" version = "0.3.77" @@ -1270,12 +1695,28 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lebe" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" + [[package]] name = "libc" version = "0.2.174" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" +[[package]] +name = "libfuzzer-sys" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5037190e1f70cbeef565bd267599242926f724d3b8a9f510fd7e0b540cfa4404" +dependencies = [ + "arbitrary", + "cc", +] + [[package]] name = "libloading" version = "0.8.8" @@ -1286,6 +1727,12 @@ dependencies = [ "windows-targets 0.53.2", ] +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -1318,13 +1765,18 @@ dependencies = [ "bytesize", "cuda-cc", "ggus", + "half", + "image", "log", "lru 0.14.0", + "mem-rearrange", "memmap2", "minijinja", + "ndarray", + "ndarray-layout", "nn", "operators", - "rand", + "rand 0.9.1", "regex", "search-corex-tools 0.0.0 (git+https://github.com/YdrMaster/cuda-driver?rev=c535e9f)", "search-cuda-tools 0.0.0 (git+https://github.com/YdrMaster/cuda-driver?rev=c535e9f)", @@ -1349,6 +1801,15 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "loop9" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" +dependencies = [ + "imgref", +] + [[package]] name = "lru" version = "0.12.5" @@ -1367,10 +1828,30 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "maybe-rayon" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" +dependencies = [ + "cfg-if", + "rayon", +] + [[package]] name = "mem" version = "0.0.0" -source = "git+https://github.com/YdrMaster/InfiniNN?rev=e3061d6#e3061d6078f2fa3a6105ea7815ba6bb96b52495e" +source = "git+https://github.com/CearX/InfiniNN.git?rev=3ba7418#3ba74181913b3705025dad6c8460bc1b899b7243" dependencies = [ "arg", "exec", @@ -1440,6 +1921,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -1484,16 +1966,37 @@ dependencies = [ "search-cuda-tools 0.0.0 (git+https://github.com/YdrMaster/cuda-driver?rev=b0148c0)", ] +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + [[package]] name = "ndarray-layout" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86a1db1a5ed8293057d401ebd96872cb881a3693d9b55379ae320f652aea3714" +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + [[package]] name = "nn" version = "0.0.0" -source = "git+https://github.com/YdrMaster/InfiniNN?rev=e3061d6#e3061d6078f2fa3a6105ea7815ba6bb96b52495e" +source = "git+https://github.com/CearX/InfiniNN.git?rev=3ba7418#3ba74181913b3705025dad6c8460bc1b899b7243" dependencies = [ "arg", "graph", @@ -1513,6 +2016,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "noop_proc_macro" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" + [[package]] name = "nu-ansi-term" version = "0.50.1" @@ -1532,6 +2041,27 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "bytemuck", + "num-traits", +] + +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -1559,6 +2089,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1626,7 +2157,7 @@ version = "0.10.73" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" dependencies = [ - "bitflags", + "bitflags 2.9.1", "cfg-if", "foreign-types", "libc", @@ -1667,13 +2198,14 @@ dependencies = [ [[package]] name = "operators" version = "0.0.0" -source = "git+https://github.com/YdrMaster/operators-rs?rev=01b39e8#01b39e83a2c71069944ee47951f27318f98d407a" +source = "git+https://github.com/CearX/operators-rs.git?rev=8a0d58a#8a0d58a341f548da74056e288e2129dad98f430a" dependencies = [ "build-script-cfg 0.0.0", "cublas", "cuda", "digit-layout", "fslock", + "gemm", "half", "itertools 0.14.0", "libloading", @@ -1724,7 +2256,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb45b6331bbdbb54c9a29413703e892ab94f83a31e4a546c778495a91e7fbca" dependencies = [ - "bitflags", + "bitflags 2.9.1", ] [[package]] @@ -1751,12 +2283,34 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "png" +version = "0.17.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82151a2fc869e011c153adc57cf2789ccb8d9906ce52c0b39a6b5697749d7526" +dependencies = [ + "bitflags 1.3.2", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + [[package]] name = "portable-atomic" version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "potential_utf" version = "0.1.2" @@ -1803,6 +2357,54 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "profiling" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3eb8486b569e12e2c32ad3e204dbaba5e4b5b216e9367044f25f1dba42341773" +dependencies = [ + "profiling-procmacros", +] + +[[package]] +name = "profiling-procmacros" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52717f9a02b6965224f95ca2a81e2e0c5c43baacd28ca057577988930b6c3d5b" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "pulp" +version = "0.21.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96b86df24f0a7ddd5e4b95c94fc9ed8a98f1ca94d3b01bdce2824097e7835907" +dependencies = [ + "bytemuck", + "cfg-if", + "libm", + "num-complex", + "reborrow", + "version_check", +] + +[[package]] +name = "qoi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + [[package]] name = "quote" version = "1.0.40" @@ -1818,14 +2420,35 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" dependencies = [ - "rand_chacha", - "rand_core", + "rand_chacha 0.9.0", + "rand_core 0.9.3", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", ] [[package]] @@ -1835,7 +2458,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.3", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.16", ] [[package]] @@ -1853,7 +2485,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabd94c2f37801c20583fc49dd5cd6b0ba68c716787c2dd6ed18571e1e63117b" dependencies = [ - "bitflags", + "bitflags 2.9.1", "cassowary", "compact_str", "crossterm 0.28.1", @@ -1868,6 +2500,71 @@ dependencies = [ "unicode-width 0.2.0", ] +[[package]] +name = "rav1e" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd87ce80a7665b1cce111f8a16c1f3929f6547ce91ade6addf4ec86a8dda5ce9" +dependencies = [ + "arbitrary", + "arg_enum_proc_macro", + "arrayvec", + "av1-grain", + "bitstream-io", + "built", + "cfg-if", + "interpolate_name", + "itertools 0.12.1", + "libc", + "libfuzzer-sys", + "log", + "maybe-rayon", + "new_debug_unreachable", + "noop_proc_macro", + "num-derive", + "num-traits", + "once_cell", + "paste", + "profiling", + "rand 0.8.5", + "rand_chacha 0.3.1", + "simd_helpers", + "system-deps", + "thiserror 1.0.69", + "v_frame", + "wasm-bindgen", +] + +[[package]] +name = "ravif" +version = "0.11.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5825c26fddd16ab9f515930d49028a630efec172e903483c94796cfe31893e6b" +dependencies = [ + "avif-serialize", + "imgref", + "loop9", + "quick-error", + "rav1e", + "rayon", + "rgb", +] + +[[package]] +name = "raw-cpuid" +version = "11.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146" +dependencies = [ + "bitflags 2.9.1", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" version = "1.10.0" @@ -1888,13 +2585,19 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "reborrow" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430" + [[package]] name = "redox_syscall" version = "0.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" dependencies = [ - "bitflags", + "bitflags 2.9.1", ] [[package]] @@ -1969,6 +2672,12 @@ dependencies = [ "web-sys", ] +[[package]] +name = "rgb" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" + [[package]] name = "ring" version = "0.17.14" @@ -2001,7 +2710,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags", + "bitflags 2.9.1", "errno", "libc", "linux-raw-sys 0.4.15", @@ -2014,7 +2723,7 @@ version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" dependencies = [ - "bitflags", + "bitflags 2.9.1", "errno", "libc", "linux-raw-sys 0.9.4", @@ -2066,6 +2775,15 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.27" @@ -2133,7 +2851,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags", + "bitflags 2.9.1", "core-foundation", "core-foundation-sys", "libc", @@ -2156,6 +2874,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0f7d95a54511e0c7be3f51e8867aa8cf35148d7b9445d44de2f943e2b206e749" +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.219" @@ -2245,6 +2969,21 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + +[[package]] +name = "simd_helpers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" +dependencies = [ + "quote", +] + [[package]] name = "slab" version = "0.4.10" @@ -2352,13 +3091,27 @@ dependencies = [ "syn", ] +[[package]] +name = "sysctl" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc" +dependencies = [ + "bitflags 2.9.1", + "byteorder", + "enum-as-inner", + "libc", + "thiserror 1.0.69", + "walkdir", +] + [[package]] name = "system-configuration" version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ - "bitflags", + "bitflags 2.9.1", "core-foundation", "system-configuration-sys", ] @@ -2373,6 +3126,25 @@ dependencies = [ "libc", ] +[[package]] +name = "system-deps" +version = "6.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e535eb8dded36d55ec13eddacd30dec501792ff23a0b1682c38601b8cf2349" +dependencies = [ + "cfg-expr", + "heck", + "pkg-config", + "toml", + "version-compare", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + [[package]] name = "tempfile" version = "3.20.0" @@ -2395,13 +3167,33 @@ dependencies = [ "ndarray-layout", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + [[package]] name = "thiserror" version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" dependencies = [ - "thiserror-impl", + "thiserror-impl 2.0.12", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -2415,6 +3207,17 @@ dependencies = [ "syn", ] +[[package]] +name = "tiff" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba1310fcea54c6a9a4fd1aad794ecc02c31682f6bfbecdf460bf19533eed1e3e" +dependencies = [ + "flate2", + "jpeg-decoder", + "weezl", +] + [[package]] name = "tinystr" version = "0.8.1" @@ -2562,7 +3365,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ - "bitflags", + "bitflags 2.9.1", "bytes", "futures-util", "http", @@ -2675,12 +3478,45 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "v_frame" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2" +dependencies = [ + "aligned-vec", + "num-traits", + "wasm-bindgen", +] + [[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version-compare" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852e951cb7832cb45cb1169900d19760cfa39b82bc0ea9c0e5a14ae88411c98b" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -2809,6 +3645,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "weezl" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3" + [[package]] name = "winapi" version = "0.3.9" @@ -2825,6 +3667,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -3071,7 +3922,7 @@ version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags", + "bitflags 2.9.1", ] [[package]] @@ -3210,3 +4061,27 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zune-core" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f423a2c17029964870cfaabb1f13dfab7d092a62a29a89264f4d36990ca414a" + +[[package]] +name = "zune-inflate" +version = "0.2.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "zune-jpeg" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9e525af0a6a658e031e95f14b7f889976b74a11ba0eca5a5fc9ac8a1c43a6a" +dependencies = [ + "zune-core", +] diff --git a/llama.cu/Cargo.toml b/llama.cu/Cargo.toml index 3dce982a..661091b5 100644 --- a/llama.cu/Cargo.toml +++ b/llama.cu/Cargo.toml @@ -4,10 +4,8 @@ version = "0.0.0" edition.workspace = true [dependencies] -operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "01b39e8", default-features = false, features = [ - "nvidia-gpu", -] } -nn = { git = "https://github.com/YdrMaster/InfiniNN", rev = "e3061d6" } +operators = { git = "https://github.com/CearX/operators-rs.git", rev = "8a0d58a", default-features = false, features = ["nvidia-gpu", "common-cpu"] } +nn = { git = "https://github.com/CearX/InfiniNN.git", rev = "3ba7418" } ggus = { git = "https://github.com/InfiniTensor/gguf", rev = "23c362f" } tokeneer = { git = "https://github.com/InfiniTensor/tokeneer", rev = "c48f39f" } @@ -23,6 +21,11 @@ minijinja = { version = "2.11", default-features = false, features = [ "builtins", "serde", ] } +half = "2.4" +ndarray-layout = "0.2" +mem-rearrange = "0.1.1" +image = "0.25.6" +ndarray = "0.16.1" [build-dependencies] build-script-cfg = "0.1" diff --git a/llama.cu/src/exec/encoder.rs b/llama.cu/src/exec/encoder.rs new file mode 100644 index 00000000..88dc6433 --- /dev/null +++ b/llama.cu/src/exec/encoder.rs @@ -0,0 +1,126 @@ +use crate::{ + exec::{ + engine::BufN, + group::{ModelGroupConfig, ModelGroupQw2vl}, + upos, + }, + handle::Handle, + model::{image::qw2vl_image_preprocess, qw2vl_mmproj::build_pos_ids}, +}; +use nn::Distribution; +use operators::{ + Operator as _, + attention::common_cpu::Operator as AttnCpu, + // attention::cuda::Operator as Attn, + common_cpu::Cpu, + conv::cuda::ConvIm2Col, + cuda::{Device, Gpu}, + rearrange::cuda::Operator as Rearrange, +}; +use std::{env::var_os, path::PathBuf, time::Instant}; + +#[allow(dead_code)] +pub(crate) fn model_from_env() -> PathBuf { + let Some(model) = var_os("TEST_MODEL").map(PathBuf::from) else { + panic!("TEST_MODEL not set"); + }; + model +} + +#[allow(dead_code)] +pub fn qw2vl_infer(model_path: PathBuf, image: PathBuf, use_cuda_graph: bool) { + use crate::model::{GGufModel, map_files}; + use operators::cuda; + // 初始化 CUDA + assert!(cuda::init().is_ok()); + // 加载 model 和 image + let maps = map_files(model_path); + let mut gguf = GGufModel::read(maps.iter().map(|x| &**x)); + let d_patch = 14; + let image_mean: [f32; 3] = [0.481_454_66, 0.457_827_5, 0.408_210_73]; + let image_std: [f32; 3] = [0.268_629_54, 0.261_302_6, 0.275_777_1]; // todo: ggus + let image = qw2vl_image_preprocess(image, image_mean, image_std); + let image_shape = <[usize; 4]>::try_from(image.shape().to_vec()).unwrap(); + let [n, _c, h, w] = image_shape; + let patches = (h / d_patch) * (w / d_patch); + let nctx = (h / d_patch).max(w / d_patch); + gguf.insert_sin_cos_qw2vl(nctx); + let model = gguf.qw2vl_mmproj(nctx); + // 初始化算子 + let device = Device::new(0); + let gpu = Gpu::new(device.retain_primary(), Default::default()); + // let attn = Attn::new(&gpu); + let attn = AttnCpu::new(&Cpu); + let conv = ConvIm2Col::new(&gpu); + let rearrange = Rearrange::new(&gpu); + + gpu.apply(|ctx| { + let mut handle = Handle::new(ctx); + let dist = Distribution { + start: 0, + len: 1, + total: 1, + }; + let mut models = ModelGroupQw2vl::new( + model, + image_shape, + d_patch, + dist, + None, + ModelGroupConfig { + static_model_keys: [patches], + dyn_cache_size: 1, + use_cuda_graph, + }, + attn, + conv, + rearrange, + &mut handle, + None, + ); + + // 保存 pos_ids 和 image + let pos_ids = build_pos_ids(h, w, d_patch); + let pos_len = pos_ids.len(); + let image_data = image.take(); + let image_len = image_data.len(); + const BUF_LEVEL: usize = 3; + let mut image_buf = BufN::::new(image_len, BUF_LEVEL, ctx); + let mut pos_buf = BufN::::new(pos_len, BUF_LEVEL, ctx); + image_buf.save(image_data.as_slice()); + pos_buf.save(&pos_ids); + + // 加载到设备 + let stream = ctx.stream(); + let (key, _tok_buf) = models.load_inputs_qw2vl_mmproj( + &mut handle, + image_len, + pos_len, + &image_buf, + &pos_buf, + image_shape, + d_patch, + &stream, + ); + + // 推理 + let time = Instant::now(); + let reqs = vec![]; // QW2VLMMProj 不需要 cache + let _x = models.launch(key, &reqs, &mut handle, &stream); + // utils::fmt(&_x, stream.ctx()); + println!("encode {n} x {h} x {w} image in {:?}", time.elapsed()); + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + // #[test] + fn _test_qw2vl_infer() { + use crate::model::image::image_from_env; + let model = model_from_env(); + let image = image_from_env(); + qw2vl_infer(model, image, false); + } +} diff --git a/llama.cu/src/exec/engine.rs b/llama.cu/src/exec/engine.rs index aef9101b..109a2150 100644 --- a/llama.cu/src/exec/engine.rs +++ b/llama.cu/src/exec/engine.rs @@ -425,7 +425,7 @@ fn out_idx(reqs: &[Req], outs: impl IntoIterator) -> Vec { +pub(crate) struct BufN<'ctx, T> { buf: HostMem<'ctx>, index: usize, level: usize, @@ -433,7 +433,7 @@ struct BufN<'ctx, T> { } impl<'ctx, T: Copy> BufN<'ctx, T> { - fn new(len: usize, level: usize, ctx: &'ctx CurrentCtx) -> Self { + pub(crate) fn new(len: usize, level: usize, ctx: &'ctx CurrentCtx) -> Self { Self { buf: ctx.malloc_host::(len * level), index: 0, @@ -444,7 +444,7 @@ impl<'ctx, T: Copy> BufN<'ctx, T> { } impl BufN<'_, T> { - fn save(&mut self, data: &[T]) { + pub(crate) fn save(&mut self, data: &[T]) { let data = unsafe { std::slice::from_raw_parts(data.as_ptr().cast(), size_of_val(data)) }; if self.index + 1 == self.level { diff --git a/llama.cu/src/exec/group.rs b/llama.cu/src/exec/group.rs index 8b8ca441..eecbf864 100644 --- a/llama.cu/src/exec/group.rs +++ b/llama.cu/src/exec/group.rs @@ -1,11 +1,15 @@ use super::{CacheParts, Progress, model::ModelExec, upos}; use crate::{batch::Req, handle::Handle, load::load_weight, memory::MemPages}; use nn::{ - Distribution, Graph, GraphBuilder, LLaMA, NNGraph, Tensor, TensorMeta, digit_layout::types, op, + Distribution, Graph, GraphBuilder, LLaMA, NNGraph, Qwen2VLmmproj, Tensor, TensorMeta, + digit_layout::types, op, }; use operators::{ + attention::common_cpu::Operator as AttnCpu, attention_kv_cached::cuda::Operator as Attn, + conv::cuda::ConvIm2Col, cuda::{DevByte, DevMem, Stream, VirByte}, + rearrange::cuda::Operator as Rearrange, }; use std::{ collections::BTreeMap, @@ -158,6 +162,169 @@ impl<'ctx> ModelGroup<'ctx> { } } +pub(crate) struct ModelGroupQw2vl<'ctx> { + internal: Internal<'ctx>, + attn: AttnCpu, + conv: ConvIm2Col, + rearrange: Rearrange, + pages: MemPages, + _weight: DevMem<'ctx>, +} + +impl<'ctx> ModelGroupQw2vl<'ctx> { + #[allow(dead_code)] + #[allow(clippy::too_many_arguments)] + pub fn new>( + qw2vl: Qwen2VLmmproj>, + image_shape: [usize; 4], + d_patch: usize, + dist: Distribution, + progress: Option>, + config: ModelGroupConfig, + + attn: AttnCpu, + conv: ConvIm2Col, + rearrange: Rearrange, + handle: &mut Handle<'ctx>, + barrier: Option<&Barrier>, + ) -> Self { + let ModelGroupConfig { + static_model_keys, + mut dyn_cache_size, + use_cuda_graph, + } = config; + // 构建计算图 + let NNGraph(Graph { topo, nodes, edges }) = builder() + .build( + qw2vl.tensor_parallel(dist), + [ + TensorMeta::new( + types::F16, + [ + "n_img".into(), + "channels".into(), + "h_img".into(), + "w_img".into(), + ], + ), + TensorMeta::new(types::U32, ["patches".into(), 2.into()]), + ], + ) + .unwrap(); + // 加载权重 + let dev = handle.ctx.dev(); + let mut pages = MemPages::new(dev); + let (_weight, edges) = load_weight(edges, progress, handle.ctx); + // 构建 cuda graph + let graph = NNGraph(Graph { topo, nodes, edges }); + let static_models = if use_cuda_graph { + static_model_keys + .into_iter() + .map(|n_tok| { + if let Some(b) = barrier { + b.wait(); + } + let key = NonZeroUsize::new(n_tok).unwrap(); + let exec = ModelExec::new_qw2vl( + graph.clone(), + image_shape[0], + image_shape[1], + image_shape[2], + image_shape[3], + d_patch, + handle, + &mut pages, + false, + ); + (key, exec) + }) + .collect::>() + } else { + dyn_cache_size += static_model_keys.into_iter().count(); + Default::default() + }; + + let models_with_one_dyn = Internal::new(graph, static_models, dyn_cache_size); + Self { + internal: models_with_one_dyn, + attn, + rearrange, + conv, + pages, + _weight, + } + } + + #[allow(dead_code)] + #[allow(clippy::too_many_arguments)] + pub fn load_inputs_qw2vl_mmproj( + &mut self, + handle: &mut Handle<'ctx>, + image_len: usize, + pos_len: usize, + image_buf: &[u8], + pos_buf: &[upos], + image_shape: [usize; 4], + d_patch: usize, + stream: &Stream<'ctx>, + ) -> (NonZeroUsize, &mut [DevByte]) { + let key = self.internal.get_key(NonZeroUsize::new(image_len).unwrap()); + let model = self.internal.map_exec_qw2vl( + key, + handle, + &mut self.pages, + stream, + image_shape, + d_patch, + ); + stream.memcpy_h2d(model.tok_buf(), &image_buf[..key.get()]); + stream.memcpy_h2d(model.pos_buf(), &pos_buf[..pos_len]); + (key, model.tok_buf()) + } + + #[allow(dead_code)] + pub fn launch( + &mut self, + key: NonZeroUsize, + reqs: &[Req], + handle: &mut Handle, + stream: &Stream<'ctx>, + ) -> Tensor<*const VirByte, 2> { + let Self { + internal, + attn, + conv, + rearrange, + pages, + .. + } = self; + + let mut reqs = reqs + .iter() + .map(|req| Req { + cache: req.cache.0[handle.rank()].lock().unwrap(), + pos: req.pos, + seq: req.seq, + }) + .collect::>(); + let reqs = reqs + .iter_mut() + .map(|req| { + req.cache.update(req.pos + req.seq, pages); + Req { + cache: req.cache.as_tensor(), + pos: req.pos, + seq: req.seq, + } + }) + .collect::>(); + + internal + .get_mut(&key) + .unwrap() + .launch_qw2vl(attn, conv, rearrange, handle, &reqs, stream) + } +} struct Internal<'ctx> { static_models: BTreeMap>, dyn_model_cache: lru::LruCache>, @@ -231,20 +398,76 @@ impl<'ctx> Internal<'ctx> { model.map(pages); model } + + fn map_exec_qw2vl( + &mut self, + key: NonZero, + handle: &mut Handle<'ctx>, + pages: &mut MemPages, + stream: &Stream<'ctx>, + image_shape: [usize; 4], + d_patch: usize, + ) -> &mut ModelExec<'ctx> { + // 检查当前映射的模型 + if let Some(mapped) = self.mapped { + if mapped == key { + return self.get_mut(&key).unwrap(); + } + // 当前映射的模型不是要映射的模型,解映射 + if let Some(mapped) = self.get_mut(&mapped) { + stream.synchronize(); + mapped.unmap(pages) + } + } + let Self { + static_models, + dyn_model_cache, + mapped, + graph, + } = self; + // 更新记录 + *mapped = Some(key); + // 查找或新建模型 + let model = static_models.get_mut(&key).unwrap_or_else(|| { + dyn_model_cache.get_or_insert_mut(key, || { + log::info!("create modelExec for key {}", key.get()); + ModelExec::new_qw2vl( + graph.clone(), + image_shape[0], + image_shape[1], + image_shape[2], + image_shape[3], + d_patch, + handle, + pages, + false, + ) + }) + }); + // 建立映射 + model.map(pages); + model + } } fn builder() -> GraphBuilder { let mut ans = GraphBuilder::default(); ans.register_op("embedding", op::embedding::Embedding) + .register_op("add", op::add::Add) + .register_op("conv", op::conv::Conv) + .register_op("layer-norm", op::normalization::LayerNorm) .register_op("rms-norm", op::normalization::RmsNorm) .register_op("linear", op::linear::Linear) .register_op("rope", op::rope::Rope) + .register_op("mrope", op::mrope::Mrope) .register_op("attention", op::attention::Attention) + .register_op("gelu", op::activation::GeLU) .register_op("swiglu", op::activation::SwiGLU) .register_op("concat", op::concat::Concat) .register_op("split", op::split::Split) .register_op("tile", op::tile::Tile) .register_op("merge", op::merge::Merge) + .register_op("transpose", op::transpose::Transpose) .register_op("all-reduce", op::all_reduce::AllReduce); ans } diff --git a/llama.cu/src/exec/mod.rs b/llama.cu/src/exec/mod.rs index 0da449e4..5ca62256 100644 --- a/llama.cu/src/exec/mod.rs +++ b/llama.cu/src/exec/mod.rs @@ -1,4 +1,5 @@ -mod engine; +mod encoder; +mod engine; mod engine_manager; mod group; mod kv_cache; diff --git a/llama.cu/src/exec/model.rs b/llama.cu/src/exec/model.rs index 7ff744e5..f1f32a1c 100644 --- a/llama.cu/src/exec/model.rs +++ b/llama.cu/src/exec/model.rs @@ -9,8 +9,11 @@ use bytesize::ByteSize; use log::trace; use nn::{NNGraph, Tensor}; use operators::{ + attention::common_cpu::Operator as AttnCpu, attention_kv_cached::cuda::Operator as Attn, + conv::cuda::ConvIm2Col, cuda::{DevByte, Stream, VirByte, VirMem}, + rearrange::cuda::Operator as Rearrange, }; use std::time::Instant; @@ -77,6 +80,79 @@ impl<'ctx> ModelExec<'ctx> { outputs, } } + + #[allow(dead_code)] + #[allow(clippy::too_many_arguments)] + pub fn new_qw2vl( + graph: NNGraph>, + n_img: usize, + channels: usize, + h_img: usize, + w_img: usize, + d_patch: usize, + handle: &mut Handle<'ctx>, + pages: &mut MemPages, + use_cuda_graph: bool, + ) -> Self { + let patches = (h_img / d_patch) * (w_img / d_patch); + let graph = graph.lower( + &[ + ("n_img", n_img), + ("channels", channels), + ("h_img", h_img), + ("w_img", w_img), + ("patches", patches), + ] + .into(), + |t| t, + ); + + let mem_range_map = graph.mem_range_map(8 << 30, 512); + + let mut workspace = pages.reserve_vir(mem_range_map.range.len()); + let ptr = workspace.as_ptr(); + let graph = graph.lower( + |key| unsafe { ptr.byte_add(mem_range_map.map[&key].start) }, + |&data| data, + ); + let inputs: Box<[Tensor<*const VirByte, 2>]> = graph + .0 + .topo + .global_inputs() + .map(|i| graph.0.edges[i].clone()) + .collect::>(); + let outputs = graph + .0 + .topo + .global_outputs() + .iter() + .map(|&i| graph.0.edges[i].clone()) + .collect::>(); + let exec = graph.into_exec(); + + // memcpy node 要求当时虚地址有对应的物理页 + pages.map(&mut workspace, ..); + + // 构造 cuda graph + let time = Instant::now(); + let execs = handle.build_steps(exec, use_cuda_graph); + trace!( + "model compiled @{} in {:.2?}, patches = {patches}, workspace = {}", + handle.ctx.dev().index(), + time.elapsed(), + ByteSize::b(workspace.len() as _).display(), + ); + + // 解除映射回收物理页 + pages.unmap(&mut workspace, ..); + + Self { + execs, + workspace, + inputs, + outputs, + } + } } impl ModelExec<'_> { @@ -119,6 +195,41 @@ impl ModelExec<'_> { } Step::Attention(box_) => handle.launch_attn(attn, box_, reqs, stream), Step::Exec(exec) => handle.launch_nn_exec(exec, stream), + Step::Rearrange(_) => unreachable!(), + Step::Conv(_) => unreachable!(), + } + } + destruct!([x] = self.outputs.clone()); + x + } + + #[allow(dead_code)] + pub fn launch_qw2vl( + &mut self, + attn: &AttnCpu, + conv: &ConvIm2Col, + rearrange: &Rearrange, + handle: &mut Handle, + reqs: &[Req>], + stream: &Stream, + ) -> Tensor<*const VirByte, 2> { + // 执行 + for exec in &self.execs { + stream.synchronize(); + match exec { + Step::Graph(graph, stub) => { + stream.launch_graph(graph); + if !stub.is_empty() { + for t in stub { + utils::fmt(t, stream.ctx()) + } + std::process::exit(0); + } + } + Step::Attention(box_) => handle.launch_attn_qw2vl(attn, box_, reqs, stream), + Step::Rearrange(box_) => handle.launch_rearrange(rearrange, box_, reqs, stream), + Step::Conv(box_) => handle.launch_conv(conv, box_, reqs, stream), + Step::Exec(exec) => handle.launch_nn_exec(exec, stream), } } destruct!([x] = self.outputs.clone()); diff --git a/llama.cu/src/exec/step.rs b/llama.cu/src/exec/step.rs index 8fc10632..12819e5f 100644 --- a/llama.cu/src/exec/step.rs +++ b/llama.cu/src/exec/step.rs @@ -2,23 +2,44 @@ batch::Req, handle::Handle, op::{self, Operator as _}, - utils::{destruct, layout, offset_ptr}, + utils::{Blob, destruct, layout, offset_ptr}, }; +use ndarray_layout::{ArrayLayout, Endian}; use nn::{Arg, Named, Tensor}; use operators::{ - Operator as _, + Operator as _, TensorLayout, + attention::Args as AttnNoKvArgs, + attention::common_cpu::Operator as AttnCpu, attention_kv_cached::{Args as AttnArgs, cuda::Operator as Attn}, - cuda::{CaptureStream, GraphExec, Stream, VirByte}, + conv::{Args as ConvArgs, cuda::ConvIm2Col}, + cuda::{CaptureStream, DevByte, GraphExec, Stream, VirByte, memcpy_d2h, memcpy_h2d}, + rearrange::{Args as RearrArgs, cuda::Operator as Rearr}, }; use regex::Regex; use std::{fmt, sync::LazyLock}; +#[allow(dead_code)] pub(super) enum Step<'ctx> { Graph(GraphExec<'ctx>, Box<[Tensor<*const VirByte, 2>]>), Attention(Box), + Rearrange(Box), + Conv(Box), Exec(nn::Exec<*const VirByte>), } +pub(super) struct Rearrange { + pub dst: Tensor<*const VirByte, 2>, + pub src: Tensor<*const VirByte, 2>, +} + +pub(super) struct Conv { + pub y: Tensor<*const VirByte, 2>, + pub x: Tensor<*const VirByte, 2>, + pub w: Tensor<*const VirByte, 2>, + pub b: Option>, + pub d_patch: usize, +} + pub(super) struct Attention { pub iblk: usize, pub q: Tensor<*const VirByte, 2>, @@ -79,6 +100,61 @@ impl<'ctx> Handle<'ctx> { exec_.push(Step::Attention(Box::new(Attention { iblk, q, k, v, o }))); continue; } + if exec.node.value.name == "merge" { + if let Some(stream) = stream.take() { + exec_.push(Step::Graph( + self.ctx.instantiate(&stream.end()), + Default::default(), + )) + } + + let nn::Exec { + node: Named { name: _, value: _ }, + inputs, + outputs, + } = exec; + + destruct!([src] = inputs); + destruct!([dst] = outputs); + + exec_.push(Step::Rearrange(Box::new(Rearrange { dst, src }))); + continue; + } + if exec.node.value.name == "conv" { + if let Some(stream) = stream.take() { + exec_.push(Step::Graph( + self.ctx.instantiate(&stream.end()), + Default::default(), + )) + } + + let nn::Exec { + node: Named { name: _, value: op }, + inputs, + outputs, + } = exec; + + let Some(nn::Arg::Bool(bias)) = op.arg else { + panic!() + }; + let (x, w, b) = if bias { + destruct!([x, w, b] = inputs); + (x, w, Some(b)) + } else { + destruct!([x, w] = inputs); + (x, w, None) + }; + destruct!([y] = outputs); + + exec_.push(Step::Conv(Box::new(Conv { + y, + x, + w, + b, + d_patch: 14, // todo: from model + }))); + continue; + } if use_cuda_graph { self.launch_nn_exec( &exec, @@ -120,6 +196,11 @@ impl<'ctx> Handle<'ctx> { "rms-norm" => launch!(RmsNorm), "layer-norm" => launch!(LayerNorm), "linear" => launch!(Linear), + "add" => match inputs[0].shape().len() { + 2 => launch!(Add), + 4 => launch!(Add4d), + _ => panic!("add: unsupported shape"), + }, "rope" => launch!(Rope), "mrope" => launch!(MRope), "gelu" => launch!(Gelu), @@ -185,6 +266,125 @@ impl<'ctx> Handle<'ctx> { .unwrap() } } + + pub(super) fn launch_attn_qw2vl( + &mut self, + op: &AttnCpu, + attn: &Attention, + _reqs: &[Req>], + _stream: &Stream, + ) { + let Attention { + iblk: _, + q, + k, + v, + o, + } = attn; + + // d2h + let d2h = |tensor: &Tensor<*const VirByte, 2>| { + let mem_range = tensor.layout().data_range(); + let ptr = tensor.get().cast::(); + let len = *mem_range.end() as usize + tensor.dt().nbytes(); + let slice = unsafe { std::slice::from_raw_parts(ptr, len) }; + let mut host = Blob::new(len); + memcpy_d2h(&mut host, slice); + tensor.as_ref().map(|_| host) + }; + let (q_host, k_host, v_host, o_host) = (d2h(q), d2h(k), d2h(v), d2h(o)); + let [q_, k_, v_, o_] = [&q_host, &k_host, &v_host, &o_host] + .map(|h| h.as_deref().map(|t| t.as_ptr() as *const VirByte)); + + // cpu cal + op.launch( + &AttnNoKvArgs { + q_layout: layout(q), + q_base: offset_ptr(&q_).cast_mut().cast(), + k_layout: layout(k), + k_base: offset_ptr(&k_).cast(), + v_layout: layout(v), + v_base: offset_ptr(&v_).cast(), + o_layout: layout(o), + o_base: offset_ptr(&o_).cast_mut().cast(), + mask: operators::fuesd_softmax::AttnMask::None, + }, + &mut [], + &operators::common_cpu::ThisThread, + ) + .unwrap(); + + // h2d + let h2d = |tensor: &Tensor<*const VirByte, 2>, host: &Tensor<*const VirByte, 2>| { + let mem_range = tensor.layout().data_range(); + let ptr = tensor.get().cast::().cast_mut(); + let len = *mem_range.end() as usize + tensor.dt().nbytes(); + let host = unsafe { std::slice::from_raw_parts(host.get().cast::(), len) }; + let dev = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + memcpy_h2d(dev, host); + }; + h2d(o, &o_); + } + + pub(super) fn launch_conv( + &mut self, + op: &ConvIm2Col, + conv: &Conv, + _reqs: &[Req>], + stream: &Stream, + ) { + let Conv { + y, + x, + w, + b, + d_patch, + } = conv; + + op.launch( + &ConvArgs { + y_layout: layout(y), + y_base: offset_ptr(y).cast_mut().cast(), + x_layout: layout(x), + x_base: offset_ptr(x).cast(), + w_layout: layout(w), + w_base: offset_ptr(w).cast(), + b_layout: b.as_ref().map(layout), + b_base: b.as_ref().map(|b| offset_ptr(b).cast()), + strides: [*d_patch; 2], + dilations: [1; 2], + pads: [0; 4], + }, + &mut [], + stream, + ) + .unwrap() + } + + pub(super) fn launch_rearrange( + &mut self, + op: &Rearr, + rearrange: &Rearrange, + _reqs: &[Req>], + stream: &Stream, + ) { + let Rearrange { dst, src } = rearrange; + op.launch( + &RearrArgs { + dst_layout: TensorLayout { + dt: src.dt(), + layout: ArrayLayout::<2>::new_contiguous(src.shape(), Endian::BigEndian, 2) + .to_inline_size(), + }, + dst_base: offset_ptr(dst).cast_mut().cast(), + src_layout: layout(src), + src_base: offset_ptr(src).cast(), + }, + &mut [], + stream, + ) + .unwrap(); + } } struct ErrorFmt<'a> { diff --git a/llama.cu/src/model/image.rs b/llama.cu/src/model/image.rs new file mode 100644 index 00000000..8380fa5f --- /dev/null +++ b/llama.cu/src/model/image.rs @@ -0,0 +1,188 @@ +use half::f16; +use image::{DynamicImage, GenericImageView, RgbImage}; +use mem_rearrange::Rearranging; +use ndarray::{Array3, Array4, Axis}; +use ndarray_layout::{ArrayLayout, Endian}; +use nn::{Tensor, digit_layout::types}; +use std::{env::var_os, path::PathBuf}; + +fn cubic_kernel(x: f32) -> f32 { + let abs_x = x.abs(); + if abs_x < 1.0 { + (1.5 * abs_x.powi(3)) - (2.5 * abs_x.powi(2)) + 1.0 + } else if abs_x < 2.0 { + (-0.5 * abs_x.powi(3)) + (2.5 * abs_x.powi(2)) - (4.0 * abs_x) + 2.0 + } else { + 0.0 + } +} + +fn bicubic_resize(input: &RgbImage, out_w: u32, out_h: u32) -> RgbImage { + let (in_w, in_h) = input.dimensions(); + let mut out = RgbImage::new(out_w, out_h); + for y in 0..out_h { + let fy = (y as f32 + 0.5) * (in_h as f32 / out_h as f32) - 0.5; + let y_int = fy.floor() as i32; + let y_frac = fy - y_int as f32; + for x in 0..out_w { + let fx = (x as f32 + 0.5) * (in_w as f32 / out_w as f32) - 0.5; + let x_int = fx.floor() as i32; + let x_frac = fx - x_int as f32; + let mut rgb = [0.0f32; 3]; + for m in -1..=2 { + let wy = cubic_kernel(m as f32 - y_frac); + let sy = y_int + m; + if sy < 0 || sy >= in_h as i32 { + continue; + } + for n in -1..=2 { + let wx = cubic_kernel(x_frac - n as f32); + let sx = x_int + n; + if sx < 0 || sx >= in_w as i32 { + continue; + } + let pixel = input.get_pixel(sx as u32, sy as u32); + for c in 0..3 { + rgb[c] += pixel[c] as f32 * wx * wy; + } + } + } + let pixel = image::Rgb([ + rgb[0].clamp(0.0, 255.0) as u8, + rgb[1].clamp(0.0, 255.0) as u8, + rgb[2].clamp(0.0, 255.0) as u8, + ]); + out.put_pixel(x, y, pixel); + } + } + out +} + +fn normalize(img: &RgbImage, mean: [f32; 3], std: [f32; 3]) -> Array3 { + let (w, h) = img.dimensions(); + let mut arr = Array3::::zeros((3, h as usize, w as usize)); + for (x, y, pixel) in img.enumerate_pixels() { + for c in 0..3 { + let val = pixel[c] as f32 / 255.0; + arr[[c, y as usize, x as usize]] = (val - mean[c]) / std[c]; + } + } + arr +} + +fn smart_resize( + height: u32, + width: u32, + factor: u32, + min_pixels: u32, + max_pixels: u32, +) -> (u32, u32) { + let height = height as f32; + let width = width as f32; + let factor = factor as f32; + let min_pixels = min_pixels as f32; + let max_pixels = max_pixels as f32; + + if height < factor || width < factor { + panic!("height:{height} or width:{width} must be larger than factor:{factor}"); + } else if (height.max(width) / height.min(width)) > 200.0 { + panic!( + "absolute aspect ratio must be smaller than 200, got {}", + height.max(width) / height.min(width) + ); + } + + let mut h_bar = (height / factor).round() * factor; + let mut w_bar = (width / factor).round() * factor; + if h_bar * w_bar > max_pixels { + let beta = ((height * width) / max_pixels).sqrt(); + h_bar = ((height / beta) / factor).floor() * factor; + w_bar = ((width / beta) / factor).floor() * factor; + } else if h_bar * w_bar < min_pixels { + let beta = (min_pixels / (height * width)).sqrt(); + h_bar = ((height * beta) / factor).ceil() * factor; + w_bar = ((width * beta) / factor).ceil() * factor; + } + + (h_bar as u32, w_bar as u32) +} + +fn preprocess_image_for_qw2vl( + img: &DynamicImage, + image_mean: [f32; 3], + image_std: [f32; 3], +) -> Array4 { + let (in_w, in_h) = img.dimensions(); + let patch_size = 14; + let factor = patch_size * 2; + let min_pixels = 56 * 56; + let max_pixels = 14 * 14 * 4 * 1280; + let (out_h, out_w) = smart_resize(in_h, in_w, factor, min_pixels, max_pixels); + let rgb = img.to_rgb8(); + let resized = bicubic_resize(&rgb, out_w, out_h); + let arr = normalize(&resized, image_mean, image_std); // (3, H, W) + let arr_f16 = arr.mapv(f16::from_f32); + arr_f16.insert_axis(Axis(0)) // (1, 3, H, W) +} + +#[allow(dead_code)] +pub(crate) fn image_from_env() -> PathBuf { + let Some(img) = var_os("TEST_IMAGE").map(PathBuf::from) else { + panic!("TEST_IMAGE not set"); + }; + img +} + +pub(crate) fn qw2vl_image_preprocess( + image: PathBuf, + image_mean: [f32; 3], + image_std: [f32; 3], +) -> Tensor, 2> { + use std::time::Instant; + let time = Instant::now(); + // image preprocess + let buf = std::fs::read(&image).unwrap(); + let img = image::load_from_memory(&buf).unwrap(); + println!("load image {:?}", time.elapsed()); + let arr = preprocess_image_for_qw2vl(&img, image_mean, image_std); + println!("image preprocess {:?}", time.elapsed()); + let shape = arr.shape().to_vec(); + let strides = arr.strides().to_vec(); + let offset = 0_isize; + let arr = unsafe { + std::slice::from_raw_parts(arr.as_ptr() as *const u8, arr.len() * size_of::()) + }; + // rearrange + let shape = <[usize; 4]>::try_from(shape).unwrap(); + let src_strides = <[isize; 4]>::try_from(strides) + .unwrap() + .map(|x| x * size_of::() as isize); + let src_layout = ArrayLayout::<2>::new(&shape, &src_strides, offset); + let dst_layout = ArrayLayout::<2>::new_contiguous(&shape, Endian::BigEndian, 2); + let scheme = Rearranging::new(&dst_layout, &src_layout, 2).unwrap(); + let binding = vec![0u8; arr.len()]; + let image = binding.as_slice(); + let dst = image.as_ptr() as *mut u8; + let src = arr.as_ptr(); + unsafe { scheme.launch(dst, src) }; + // return tensor + Tensor::from_raw_parts(types::F16, dst_layout, image.to_vec()) +} + +// #[test] +fn _test_qw2vl_image_preprocess() { + let image = image_from_env(); + let image_mean: [f32; 3] = [0.481_454_66, 0.457_827_5, 0.408_210_73]; + let image_std: [f32; 3] = [0.268_629_54, 0.261_302_6, 0.275_777_1]; + let image = qw2vl_image_preprocess(image, image_mean, image_std); + let img = unsafe { + std::slice::from_raw_parts( + image.get().as_ptr() as *const f16, + image.get().len() / size_of::(), + ) + }; + println!("image shape: {:?}", image.shape()); + println!("image strides: {:?}", image.strides()); + println!("image offset: {:?}", image.offset()); + println!("image tensor: {img:?}"); +} diff --git a/llama.cu/src/model/mod.rs b/llama.cu/src/model/mod.rs index 67987b42..586f0508 100644 --- a/llama.cu/src/model/mod.rs +++ b/llama.cu/src/model/mod.rs @@ -1,6 +1,7 @@ mod chat_template; +pub(crate) mod image; mod llama; -mod qw2vl_mmproj; +pub(crate) mod qw2vl_mmproj; use crate::utils::{Blob, Data}; use ggus::{ diff --git a/llama.cu/src/model/qw2vl_mmproj.rs b/llama.cu/src/model/qw2vl_mmproj.rs index 41ce1cb8..e0391565 100644 --- a/llama.cu/src/model/qw2vl_mmproj.rs +++ b/llama.cu/src/model/qw2vl_mmproj.rs @@ -8,15 +8,16 @@ use nn::{ impl GGufModel<'_> { /// 构造 qw2vl_mmproj 模型 - pub fn _qw2vl_mmproj(&self) -> nn::Qwen2VLmmproj> { - let nblk = meta![self => llm_block_count]; - let d = meta![self => llm_embedding_length]; - let nh = meta![self => llm_attention_head_count]; + #[allow(dead_code)] + pub fn qw2vl_mmproj(&self, nctx: usize) -> nn::Qwen2VLmmproj> { + let nblk = meta![self => llm_block_count; 32]; + let d = meta![self => llm_embedding_length;1280]; + let nh = meta![self => llm_attention_head_count;16]; let nkvh = meta![self => llm_attention_head_count_kv; nh]; let dh = meta![self => llm_rope_dimension_count; d / nh]; - let _di = meta![self => llm_feed_forward_length]; + let _di = meta![self => llm_feed_forward_length; 5120]; let epsilon = meta![self => llm_attention_layer_norm_epsilon; 1e-6]; - let d_patch = 14; // ggus todo + let d_patch = 14; // todo: ggus let d_proj = 1536; let dt = self.tensors["v.blk.0.attn_qkv.weight"].dt(); let dt_norm = self.tensors["v.blk.0.ln1.weight"].dt(); @@ -50,13 +51,13 @@ impl GGufModel<'_> { dt, [(nh + nkvh + nkvh) * dh, d], get(&format!("v.blk.{iblk}.attn_qkv.weight")), - Some((dt_norm, get(&format!("v.blk.{iblk}.attn_qkv.bias")))), + Some((dt, get(&format!("v.blk.{iblk}.attn_qkv.bias")))), ), q_norm: None, k_norm: None, rope: Some(RoPE { multimodal: true, - nctx: 34, // todo: from image + nctx, sin: get("sin_table"), cos: get("cos_table"), }), @@ -64,7 +65,7 @@ impl GGufModel<'_> { dt, [d, nh * dh], get(&format!("v.blk.{iblk}.attn_out.weight")), - Some((dt_norm, get(&format!("v.blk.{iblk}.attn_out.bias")))), + Some((dt, get(&format!("v.blk.{iblk}.attn_out.bias")))), ), }, Normalization { @@ -82,14 +83,14 @@ impl GGufModel<'_> { dt, [d * 4, d], get(&format!("v.blk.{iblk}.ffn_up.weight")), - Some((dt_norm, get(&format!("v.blk.{iblk}.ffn_up.bias")))), + Some((dt, get(&format!("v.blk.{iblk}.ffn_up.bias")))), ), act: Activation::GeLU, down: Linear::new( dt, [d, d * 4], get(&format!("v.blk.{iblk}.ffn_down.weight")), - Some((dt_norm, get(&format!("v.blk.{iblk}.ffn_down.bias")))), + Some((dt, get(&format!("v.blk.{iblk}.ffn_down.bias")))), ), }, ) @@ -111,14 +112,14 @@ impl GGufModel<'_> { dt, [d * 4, d * 4], get("mm.0.weight"), - Some((dt_norm, get("mm.0.bias"))), + Some((dt, get("mm.0.bias"))), ), act: Activation::GeLU, down: Linear::new( dt, [d_proj, d * 4], get("mm.2.weight"), - Some((dt_norm, get("mm.2.bias"))), + Some((dt, get("mm.2.bias"))), ), }, }, @@ -126,11 +127,11 @@ impl GGufModel<'_> { } /// 插入用于 MRoPE 的 sin cos 表张量 - pub fn _insert_sin_cos_qw2vl(&mut self) { - let nctx = meta![self => llm_context_length; 34]; // todo: from image - let d = meta![self => llm_embedding_length]; - let nh = meta![self => llm_attention_head_count]; - let dh = meta![self => llm_rope_dimension_count; d / nh]; + #[allow(dead_code)] + pub(crate) fn insert_sin_cos_qw2vl(&mut self, nctx: usize) { + let d = meta![self => llm_embedding_length; 1280]; + let nh = meta![self => llm_attention_head_count; 16]; + let dh = meta![self => llm_rope_dimension_count; d / nh]; // todo: ggus let dh_div_2 = dh / 2; // h, w 维度均分 dh_div_2 let theta = meta![self => llm_rope_freq_base; 1e4]; let [sin, cos] = build_sin_cos(nctx, dh_div_2, theta, |pos, _| pos as _); @@ -140,7 +141,8 @@ impl GGufModel<'_> { } /// 构造 pos_ids 表 -pub fn _build_pos_ids(h: usize, w: usize, d_patch: usize) -> Vec { +#[allow(dead_code)] +pub(crate) fn build_pos_ids(h: usize, w: usize, d_patch: usize) -> Vec { let hp = h / d_patch; let wp = w / d_patch; let mut pos = vec![0; hp * wp * 2]; diff --git a/llama.cu/src/op/add4d.cuh b/llama.cu/src/op/add4d.cuh new file mode 100644 index 00000000..38cc9050 --- /dev/null +++ b/llama.cu/src/op/add4d.cuh @@ -0,0 +1,27 @@ +template +static __device__ void kernel( + Tdata *__restrict__ y, + int const sny, + int const smy, + int const shy, + int const swy, + Tdata const *__restrict__ x, + int const snx, + int const smx, + int const shx, + int const swx, + Tdata const *__restrict__ b, + int const snb, + int const smb, + int const shb, + int const swb) { + auto in = blockIdx.x, + im = blockIdx.y, + ih = blockIdx.z, + iw = threadIdx.x, + iy = in * sny + im * smy + ih * shy + iw * swy, + ix = in * snx + im * smx + ih * shx + iw * swx, + ib = in * snb + im * smb + ih * shb + iw * swb; + + y[iy] = Tdata(x[ix] + b[ib]); +} diff --git a/llama.cu/src/op/add4d.rs b/llama.cu/src/op/add4d.rs new file mode 100644 index 00000000..c8fd7135 --- /dev/null +++ b/llama.cu/src/op/add4d.rs @@ -0,0 +1,108 @@ +use super::{Handle, ModuleKey, Operator, cuda_type}; +use crate::utils::{destruct, dims, offset_ptr, strides}; +use nn::{Tensor, digit_layout::DigitLayout}; +use operators::cuda::{Stream, VirByte, params}; +use std::ffi::{c_int, c_uint}; + +pub struct Add4d; + +impl Operator for Add4d { + fn launch<'a, const N: usize>( + handle: &mut Handle, + arg: Option, + inputs: impl IntoIterator>, + outputs: impl IntoIterator>, + stream: &Stream, + ) { + assert!(arg.is_none()); + + destruct!([y] = outputs); + destruct!([x, b] = inputs); + // 检查维度 + dims!([n, m, hp, wp] = y); + dims!([n2, m2, hp2, wp2] = x); + dims!([n3, m3, hp3, wp3] = b); + + assert_eq!(n, n2); + assert_eq!(n, n3); + assert_eq!(m, m2); + assert_eq!(m, m3); + assert_eq!(hp, hp2); + assert_eq!(hp, hp3); + assert_eq!(wp, wp2); + assert_eq!(wp, wp3); + // 检查类型 + let dt = y.dt(); + assert_eq!(x.dt(), dt); + assert_eq!(b.dt(), dt); + // 获取 stride + strides!([sny, smy, shy, swy] = y); + strides!([snx, smx, shx, swx] = x); + strides!([snb, smb, shb, swb] = b); + // 获取最大线程数 + let max_threads_block = handle.ctx.dev().block_limit().max_threads; + // 编译内核 + let key = [ModuleKey::Text("add4d"), ModuleKey::Type(dt)].into_iter(); + let module = handle.compile(key.collect(), || code(dt)); + let kernel = module.get_kernel(c"add4d"); + // 准备参数 + let unit = dt.nbytes() as isize; + let params = params![ + offset_ptr(&y), + (sny / unit) as c_int, + (smy / unit) as c_int, + (shy / unit) as c_int, + (swy / unit) as c_int, + offset_ptr(&x), + (snx / unit) as c_int, + (smx / unit) as c_int, + (shx / unit) as c_int, + (swx / unit) as c_int, + offset_ptr(&b), + (snb / unit) as c_int, + (smb / unit) as c_int, + (shb / unit) as c_int, + (swb / unit) as c_int + ]; + // 计算线程块配置 + assert!(wp <= max_threads_block); + // 启动内核 + // gridDim = (n, m, hp) + stream.launch( + &kernel, + ((hp as c_uint, m as c_uint, n as c_uint), wp as c_uint, 0), + ¶ms.to_ptrs(), + ); + } +} + +fn code(dt: DigitLayout) -> String { + const CODE: &str = include_str!("add4d.cuh"); + let dt = cuda_type(dt); + + format!( + r#"{CODE} + +extern "C" __global__ void add4d( + {dt} *__restrict__ y, + int const sny, + int const smy, + int const shy, + int const swy, + {dt} const *__restrict__ x, + int const snx, + int const smx, + int const shx, + int const swx, + {dt} const *__restrict__ b, + int const snb, + int const smb, + int const shb, + int const swb +){{ + kernel(y, sny, smy, shy, swy, + x, snx, smx, shx, swx, + b, snb, smb, shb, swb); +}}"# + ) +} diff --git a/llama.cu/src/op/gelu.rs b/llama.cu/src/op/gelu.rs index 1a468a92..322c3108 100644 --- a/llama.cu/src/op/gelu.rs +++ b/llama.cu/src/op/gelu.rs @@ -75,7 +75,7 @@ fn code(dt: DigitLayout) -> String { extern "C" __global__ void gelu( {dt} *__restrict__ out, - {dt} const *__restrict__ data, + {dt} const *__restrict__ data ){{ kernel(out, data); }}"# diff --git a/llama.cu/src/op/mod.rs b/llama.cu/src/op/mod.rs index 147e040b..0138700a 100644 --- a/llama.cu/src/op/mod.rs +++ b/llama.cu/src/op/mod.rs @@ -1,4 +1,5 @@ mod add; +mod add4d; #[cfg(nccl)] mod all_reduce; mod embedding; @@ -20,6 +21,8 @@ use operators::cuda::{Stream, VirByte}; pub mod random_sample; +pub use add::Add; +pub use add4d::Add4d; #[cfg(nccl)] pub use all_reduce::AllReduce; pub use embedding::Embedding; diff --git a/llama.cu/src/op/mrope.rs b/llama.cu/src/op/mrope.rs index 867eeaf6..cace3fba 100644 --- a/llama.cu/src/op/mrope.rs +++ b/llama.cu/src/op/mrope.rs @@ -26,17 +26,17 @@ impl Operator for MRope { //检查dim dims!([n, dh_mut_dhead] = x); dims!([n2, _dim] = pos); // dim 维 pos_ids - dims!([nctx, dh_2] = sin); - dims!([nctx2, dh_2_] = cos); + dims!([nctx, dh_4] = sin); + dims!([nctx2, dh_4_] = cos); dims!([n3, dh_mut_dhead_] = y); assert_eq!(n, n2); assert_eq!(n, n3); assert_eq!(dh_mut_dhead, dh_mut_dhead_); - assert_eq!(dh_2, dh_2_); + assert_eq!(dh_4, dh_4_); assert_eq!(nctx, nctx2); - let dh = dh_2 * 2; + let dh = dh_4 * 4; let d_head = dh_mut_dhead / dh; assert_eq!(dh_mut_dhead % dh, 0); @@ -99,8 +99,8 @@ impl Operator for MRope { stream.launch( &kernel, ( - (nh_h as c_uint, n as c_uint), - (dh_div_2 as c_uint, nh_l as c_uint), + (n as c_uint, nh_h as c_uint), + (nh_l as c_uint, dh_div_2 as c_uint), 0, ), ¶ms.to_ptrs(),