This is an automated email from the ASF dual-hosted git repository. guanmingchiu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/mahout.git
commit 7174f3899091420b41d50a897ab7c52795f2ebe1 Author: Guan-Ming (Wesley) Chiu <[email protected]> AuthorDate: Fri Dec 5 23:14:54 2025 +0800 [QDP] Integrate Apache Arrow and Parquet for data processing (#680) * Integrate Apache Arrow & Parquet for data processing * Optimize Arrow Float64Array handling in io * Add chunked Arrow Float64Array support * Refactor encoding to support chunked Arrow Float64Array input * Refactor I/O and encoding documentation to remove zero-copy --- qdp/Cargo.lock | 1164 ++++++++++++++++++++++++++++++++- qdp/Cargo.toml | 4 + qdp/qdp-core/Cargo.toml | 2 + qdp/qdp-core/src/error.rs | 3 + qdp/qdp-core/src/gpu/encodings/mod.rs | 15 + qdp/qdp-core/src/io.rs | 272 ++++++++ qdp/qdp-core/src/lib.rs | 48 ++ qdp/qdp-core/tests/parquet_io.rs | 163 +++++ 8 files changed, 1640 insertions(+), 31 deletions(-) diff --git a/qdp/Cargo.lock b/qdp/Cargo.lock index 5d8cedd4c..c3694bab9 100644 --- a/qdp/Cargo.lock +++ b/qdp/Cargo.lock @@ -2,12 +2,333 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "arrow" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5ec52ba94edeed950e4a41f75d35376df196e8cb04437f7280a5aa49f20f796" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "54.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc766fdacaf804cb10c7c70580254fcdb5d55cdfda2bc57b02baf5223a3af9e" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a12fcdb3f1d03f69d3ec26ac67645a8fe3f878d77b5ebb0b15d64a116c212985" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.15.5", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "263f4801ff1839ef53ebd06f99a56cecd1dbaf314ec893d93168e2e860e0291c" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede6175fbc039dfc946a61c1b6d42fd682fcecf5ab5d148fbe7667705798cac9" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1644877d8bc9a0ef022d9153dc29375c2bda244c39aec05a91d0e87ccf77995f" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "regex", +] + +[[package]] +name = "arrow-data" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61cfdd7d99b4ff618f167e548b2411e5dd2c98c0ddebedd7df433d34c20a4429" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"62ff528658b521e33905334723b795ee56b393dbe9cf76c8b1f64b648c65a60c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ee5b4ca98a7fb2efb9ab3309a5d1c88b5116997ff93f3147efdc1062a6158e9" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap", + "lexical-core", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a3334a743bd2a1479dbc635540617a3923b4b2f6870f37357339e6b5363c21" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d1d7a7291d2c5107e92140f75257a99343956871f3d3ab33a7b41532f79cb68" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cfaf5e440be44db5413b75b72c2a87c1f8f0627117d110264048f2969b99e9" + +[[package]] +name = "arrow-select" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69efcd706420e52cd44f5c4358d279801993846d1c2a8e52111853d61d55a619" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21546b337ab304a32cfc0770f671db7411787586b45b78b4593ae78e64e2b03" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + 
"arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "brotli" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "4.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a334ef7c9e23abf0ce748e8cd309037da93e606ad52eb372e4ce327a0dcfbdfd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" + [[package]] name = "cc" version = "1.2.48" @@ -15,6 +336,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c481bdbf0ed3b892f6f806287d72acd515b352a4ec27a208489b8c1bc839633a" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] @@ -24,6 +347,52 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "chrono" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +dependencies = [ + "iana-time-zone", + "num-traits", + "windows-link", +] + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.16", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -44,70 +413,410 @@ dependencies = [ ] [[package]] -name = "crossbeam-utils" -version = "0.8.21" +name = "crossbeam-utils" +version = "0.8.21" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "cudarc" +version = "0.13.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "486c221362668c63a1636cfa51463b09574433b39029326cff40864b3ba12b6e" +dependencies = [ + "libloading", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "find-msvc-tools" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" + +[[package]] +name = "flatbuffers" +version = "24.12.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +dependencies = [ + "bitflags", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "iana-time-zone" +version = "0.1.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", +] + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + 
"lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libc" +version = "0.2.177" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" + +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lz4_flex" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +dependencies = [ + "twox-hash 2.1.2", +] + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "memoffset" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] [[package]] -name = "cudarc" -version = "0.13.9" +name = "miniz_oxide" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "486c221362668c63a1636cfa51463b09574433b39029326cff40864b3ba12b6e" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ - "libloading", + "adler2", + "simd-adler32", ] [[package]] -name = "either" -version = "1.15.0" +name = "num" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] [[package]] -name = "find-msvc-tools" -version = "0.1.5" +name = "num-bigint" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] [[package]] -name = "heck" -version = "0.5.0" +name = "num-complex" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] [[package]] -name = "indoc" -version = "2.0.7" +name = "num-integer" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "rustversion", + "num-traits", ] [[package]] -name = "libc" -version = "0.2.177" +name = "num-iter" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] [[package]] -name = "libloading" -version = "0.8.9" +name = "num-rational" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" dependencies = [ - "cfg-if", - "windows-link", + "num-bigint", + "num-integer", + "num-traits", ] [[package]] -name = "memoffset" -version = "0.9.1" +name = "num-traits" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +checksum = 
"071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -125,6 +834,60 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parquet" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfb15796ac6f56b429fd99e33ba133783ad75b27c36b4b5ce06f1f82cc97754e" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "half", + "hashbrown 0.15.5", + "lz4_flex", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "twox-hash 1.6.3", + "zstd", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "portable-atomic" version = "1.11.1" @@ -205,8 +968,10 @@ dependencies = [ name = "qdp-core" version = "0.1.0" dependencies = [ + "arrow", "cudarc", "nvtx", + "parquet", "qdp-kernels", "rayon", "thiserror", @@ -237,6 +1002,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] 
name = "rayon" version = "1.11.0" @@ -257,18 +1028,140 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "regex" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustversion" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", + "serde_core", +] + [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "syn" version = "2.0.111" @@ -306,6 +1199,42 @@ dependencies = [ "syn", ] +[[package]] 
+name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + [[package]] name = "unicode-ident" version = "1.0.22" @@ -318,8 +1247,181 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + +[[package]] +name = "zerocopy" +version = "0.8.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff 
--git a/qdp/Cargo.toml b/qdp/Cargo.toml index 0411c8eb3..b934a5559 100644 --- a/qdp/Cargo.toml +++ b/qdp/Cargo.toml @@ -24,6 +24,10 @@ cc = "1.2" thiserror = "2.0" # Parallel computing (for CPU preprocessing) rayon = "1.10" +# Apache Arrow for columnar data format support +arrow = "54" +# Parquet support for Arrow +parquet = "54" # Release profile optimizations [profile.release] diff --git a/qdp/qdp-core/Cargo.toml b/qdp/qdp-core/Cargo.toml index 721ed5e54..729873b2c 100644 --- a/qdp/qdp-core/Cargo.toml +++ b/qdp/qdp-core/Cargo.toml @@ -9,6 +9,8 @@ qdp-kernels = { path = "../qdp-kernels" } thiserror = { workspace = true } rayon = { workspace = true } nvtx = { version = "1.3", optional = true } +arrow = { workspace = true } +parquet = { workspace = true } [lib] name = "qdp_core" diff --git a/qdp/qdp-core/src/error.rs b/qdp/qdp-core/src/error.rs index af2161bab..5cfaabedf 100644 --- a/qdp/qdp-core/src/error.rs +++ b/qdp/qdp-core/src/error.rs @@ -33,6 +33,9 @@ pub enum MahoutError { #[error("DLPack operation failed: {0}")] DLPack(String), + + #[error("I/O error: {0}")] + Io(String), } /// Result type alias for Mahout operations diff --git a/qdp/qdp-core/src/gpu/encodings/mod.rs b/qdp/qdp-core/src/gpu/encodings/mod.rs index 75cf57549..539355c2f 100644 --- a/qdp/qdp-core/src/gpu/encodings/mod.rs +++ b/qdp/qdp-core/src/gpu/encodings/mod.rs @@ -17,6 +17,7 @@ // Quantum encoding strategies (Strategy Pattern) use std::sync::Arc; +use arrow::array::Float64Array; use cudarc::driver::CudaDevice; use crate::error::Result; use crate::gpu::memory::GpuStateVector; @@ -33,6 +34,20 @@ pub trait QuantumEncoder: Send + Sync { num_qubits: usize, ) -> Result<GpuStateVector>; + /// Encode from chunked Arrow arrays + /// + /// Default implementation flattens chunks. (TODO: Encoders can override for true zero-copy.) 
+ fn encode_chunked( + &self, + device: &Arc<CudaDevice>, + chunks: &[Float64Array], + num_qubits: usize, + ) -> Result<GpuStateVector> { + // Default: flatten and use regular encode + let data = crate::io::arrow_to_vec_chunked(chunks); + self.encode(device, &data, num_qubits) + } + /// Validate input data before encoding fn validate_input(&self, data: &[f64], num_qubits: usize) -> Result<()> { Preprocessor::validate_input(data, num_qubits) diff --git a/qdp/qdp-core/src/io.rs b/qdp/qdp-core/src/io.rs new file mode 100644 index 000000000..fc9f09cd4 --- /dev/null +++ b/qdp/qdp-core/src/io.rs @@ -0,0 +1,272 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! I/O module for reading and writing quantum data +//! +//! 
This module provides efficient columnar data exchange with the data science ecosystem through Apache Arrow arrays and Parquet files. + +use std::fs::File; +use std::path::Path; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, Float64Array, RecordBatch}; +use arrow::datatypes::{DataType, Field, Schema}; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use parquet::arrow::ArrowWriter; +use parquet::file::properties::WriterProperties; + +use crate::error::{MahoutError, Result}; + +/// Convert Arrow Float64Array to Vec<f64> +/// +/// Copies Arrow's value buffer in bulk if there are no nulls; otherwise copies element by element, replacing each null with 0.0 +pub fn arrow_to_vec(array: &Float64Array) -> Vec<f64> { + if array.null_count() == 0 { + array.values().to_vec() + } else { + array.iter().map(|opt| opt.unwrap_or(0.0)).collect() + } +} + +/// Convert chunked Arrow Float64Array to Vec<f64> +/// +/// Flattens multiple Arrow arrays into a single preallocated Vec, replacing each null with 0.0 +pub fn arrow_to_vec_chunked(arrays: &[Float64Array]) -> Vec<f64> { + let total_len: usize = arrays.iter().map(|a| a.len()).sum(); + let mut result = Vec::with_capacity(total_len); + + for array in arrays { + if array.null_count() == 0 { + result.extend_from_slice(array.values()); + } else { + result.extend(array.iter().map(|opt| opt.unwrap_or(0.0))); + } + } + + result +} + +/// Reads quantum data from a Parquet file. +/// +/// Reads the first column, which must contain Float64 values; the column name is not checked. +/// This function performs one copy from Arrow to Vec. +/// To work with the Arrow arrays directly, use `read_parquet_to_arrow` instead. +/// +/// # Arguments +/// * `path` - Path to the Parquet file +/// +/// # Returns +/// Vector of f64 values from the first column +/// +/// # Example +/// ```no_run +/// use qdp_core::io::read_parquet; +/// +/// let data = read_parquet("quantum_data.parquet").unwrap(); +/// ``` +pub fn read_parquet<P: AsRef<Path>>(path: P) -> Result<Vec<f64>> { + let chunks = read_parquet_to_arrow(path)?; + Ok(arrow_to_vec_chunked(&chunks)) +} + +/// Writes quantum data to a Parquet file. 
+/// +/// Creates a single non-nullable Float64 column, named "data" unless `column_name` is given. +/// +/// # Arguments +/// * `path` - Path to write the Parquet file +/// * `data` - Slice of f64 values to write (must not be empty) +/// * `column_name` - Optional column name (defaults to "data") +/// +/// # Example +/// ```no_run +/// use qdp_core::io::write_parquet; +/// +/// let data = vec![0.5, 0.5, 0.5, 0.5]; +/// write_parquet("quantum_data.parquet", &data, None).unwrap(); +/// ``` +pub fn write_parquet<P: AsRef<Path>>( + path: P, + data: &[f64], + column_name: Option<&str>, +) -> Result<()> { + if data.is_empty() { + return Err(MahoutError::InvalidInput( + "Cannot write empty data to Parquet".to_string(), + )); + } + + let col_name = column_name.unwrap_or("data"); + + // Create Arrow schema + let schema = Arc::new(Schema::new(vec![Field::new( + col_name, + DataType::Float64, + false, + )])); + + // Create Float64Array from slice + let array = Float64Array::from_iter_values(data.iter().copied()); + let array_ref: ArrayRef = Arc::new(array); + + // Create RecordBatch + let batch = RecordBatch::try_new(schema.clone(), vec![array_ref]).map_err(|e| { + MahoutError::Io(format!("Failed to create RecordBatch: {}", e)) + })?; + + // Write to Parquet file + let file = File::create(path.as_ref()).map_err(|e| { + MahoutError::Io(format!("Failed to create Parquet file: {}", e)) + })?; + + let props = WriterProperties::builder().build(); + let mut writer = ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| { + MahoutError::Io(format!("Failed to create Parquet writer: {}", e)) + })?; + + writer.write(&batch).map_err(|e| { + MahoutError::Io(format!("Failed to write Parquet batch: {}", e)) + })?; + + writer.close().map_err(|e| { + MahoutError::Io(format!("Failed to close Parquet writer: {}", e)) + })?; + + Ok(()) +} + +/// Reads quantum data from a Parquet file as Arrow arrays. +/// +/// Returns Arrow arrays directly from Parquet batches. +/// Each element in the returned Vec corresponds to one Parquet batch. 
+/// +/// Directly constructs the Arrow array from Parquet batches +/// +/// # Arguments +/// * `path` - Path to the Parquet file +/// +/// # Returns +/// Vector of Float64Arrays, one per Parquet batch +pub fn read_parquet_to_arrow<P: AsRef<Path>>(path: P) -> Result<Vec<Float64Array>> { + let file = File::open(path.as_ref()).map_err(|e| { + MahoutError::Io(format!("Failed to open Parquet file: {}", e)) + })?; + + let builder = ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| { + MahoutError::Io(format!("Failed to create Parquet reader: {}", e)) + })?; + + let mut reader = builder.build().map_err(|e| { + MahoutError::Io(format!("Failed to build Parquet reader: {}", e)) + })?; + + let mut arrays = Vec::new(); + + while let Some(batch_result) = reader.next() { + let batch = batch_result.map_err(|e| { + MahoutError::Io(format!("Failed to read Parquet batch: {}", e)) + })?; + + if batch.num_columns() == 0 { + return Err(MahoutError::Io( + "Parquet file has no columns".to_string(), + )); + } + + let column = batch.column(0); + if !matches!(column.data_type(), DataType::Float64) { + return Err(MahoutError::Io(format!( + "Expected Float64 column, got {:?}", + column.data_type() + ))); + } + + // Clone the Float64Array (reference-counted, no data copy) + let float_array = column + .as_any() + .downcast_ref::<Float64Array>() + .ok_or_else(|| { + MahoutError::Io("Failed to downcast to Float64Array".to_string()) + })? + .clone(); + + arrays.push(float_array); + } + + if arrays.is_empty() { + return Err(MahoutError::Io( + "Parquet file contains no data".to_string(), + )); + } + + Ok(arrays) +} + +/// Writes an Arrow Float64Array to a Parquet file. +/// +/// Writes from Arrow format to Parquet. 
+/// +/// # Arguments +/// * `path` - Path to write the Parquet file +/// * `array` - Float64Array to write +/// * `column_name` - Optional column name (defaults to "data") +pub fn write_arrow_to_parquet<P: AsRef<Path>>( + path: P, + array: &Float64Array, + column_name: Option<&str>, +) -> Result<()> { + if array.is_empty() { + return Err(MahoutError::InvalidInput( + "Cannot write empty array to Parquet".to_string(), + )); + } + + let col_name = column_name.unwrap_or("data"); + + // Create Arrow schema + let schema = Arc::new(Schema::new(vec![Field::new( + col_name, + DataType::Float64, + false, + )])); + + let array_ref: ArrayRef = Arc::new(array.clone()); + + // Create RecordBatch + let batch = RecordBatch::try_new(schema.clone(), vec![array_ref]).map_err(|e| { + MahoutError::Io(format!("Failed to create RecordBatch: {}", e)) + })?; + + // Write to Parquet file + let file = File::create(path.as_ref()).map_err(|e| { + MahoutError::Io(format!("Failed to create Parquet file: {}", e)) + })?; + + let props = WriterProperties::builder().build(); + let mut writer = ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| { + MahoutError::Io(format!("Failed to create Parquet writer: {}", e)) + })?; + + writer.write(&batch).map_err(|e| { + MahoutError::Io(format!("Failed to write Parquet batch: {}", e)) + })?; + + writer.close().map_err(|e| { + MahoutError::Io(format!("Failed to close Parquet writer: {}", e)) + })?; + + Ok(()) +} diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs index 6fa43af7e..406715edd 100644 --- a/qdp/qdp-core/src/lib.rs +++ b/qdp/qdp-core/src/lib.rs @@ -18,6 +18,7 @@ pub mod dlpack; pub mod gpu; pub mod error; pub mod preprocessing; +pub mod io; #[macro_use] mod profiling; @@ -25,6 +26,7 @@ mod profiling; pub use error::{MahoutError, Result}; use std::sync::Arc; +use arrow::array::Float64Array; use cudarc::driver::CudaDevice; use crate::dlpack::DLManagedTensor; use crate::gpu::get_encoder; @@ -85,6 +87,52 @@ impl QdpEngine { pub fn 
device(&self) -> &CudaDevice { &self.device } + + /// Encode from chunked Arrow arrays (e.g. as read from Parquet) + /// + /// # Arguments + /// * `chunks` - Chunked Arrow Float64Arrays (from read_parquet_to_arrow) + /// * `num_qubits` - Number of qubits + /// * `encoding_method` - Strategy: "amplitude", "angle", or "basis" + /// + /// # Returns + /// DLPack pointer for zero-copy PyTorch integration + pub fn encode_chunked( + &self, + chunks: &[Float64Array], + num_qubits: usize, + encoding_method: &str, + ) -> Result<*mut DLManagedTensor> { + crate::profile_scope!("Mahout::EncodeChunked"); + + let encoder = get_encoder(encoding_method)?; + let state_vector = encoder.encode_chunked(&self.device, chunks, num_qubits)?; + let dlpack_ptr = { + crate::profile_scope!("Mahout::CreateDLPack"); + state_vector.to_dlpack() + }; + Ok(dlpack_ptr) + } + + /// Load data from Parquet file and encode into quantum state + /// + /// Reads the file into Arrow chunks with `read_parquet_to_arrow` and passes them to `encode_chunked`; whether the chunks are copied again depends on the selected encoder. + /// + /// # Arguments + /// * `path` - Path to Parquet file + /// * `num_qubits` - Number of qubits + /// * `encoding_method` - Strategy: "amplitude", "angle", or "basis" + pub fn encode_from_parquet( + &self, + path: &str, + num_qubits: usize, + encoding_method: &str, + ) -> Result<*mut DLManagedTensor> { + crate::profile_scope!("Mahout::EncodeFromParquet"); + + let chunks = crate::io::read_parquet_to_arrow(path)?; + self.encode_chunked(&chunks, num_qubits, encoding_method) + } } // Re-export key types for convenience diff --git a/qdp/qdp-core/tests/parquet_io.rs b/qdp/qdp-core/tests/parquet_io.rs new file mode 100644 index 000000000..7b45573ba --- /dev/null +++ b/qdp/qdp-core/tests/parquet_io.rs @@ -0,0 +1,163 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use qdp_core::io::{ + read_parquet, read_parquet_to_arrow, write_arrow_to_parquet, write_parquet, +}; +use arrow::array::Float64Array; +use std::fs; + +mod common; + +#[test] +fn test_write_and_read_parquet() { + let temp_path = "/tmp/test_quantum_data.parquet"; + let data = common::create_test_data(4); + + // Write data + write_parquet(temp_path, &data, None).unwrap(); + + // Read it back + let read_data = read_parquet(temp_path).unwrap(); + + // Verify + assert_eq!(data.len(), read_data.len()); + for (original, read) in data.iter().zip(read_data.iter()) { + assert!((original - read).abs() < 1e-10); + } + + // Cleanup + fs::remove_file(temp_path).unwrap(); +} + +#[test] +fn test_write_with_custom_column_name() { + let temp_path = "/tmp/test_custom_column.parquet"; + let data = vec![1.0, 2.0, 3.0, 4.0]; + + // Write with custom column name + write_parquet(temp_path, &data, Some("quantum_state")).unwrap(); + + // Read it back (column name doesn't matter for reading) + let read_data = read_parquet(temp_path).unwrap(); + + assert_eq!(data, read_data); + + // Cleanup + fs::remove_file(temp_path).unwrap(); +} + +#[test] +fn test_write_empty_data_fails() { + let temp_path = "/tmp/test_empty.parquet"; + let data: Vec<f64> = vec![]; + + let result = write_parquet(temp_path, &data, None); + assert!(result.is_err()); +} + +#[test] +fn test_read_nonexistent_file_fails() { + let result = 
read_parquet("/tmp/nonexistent_file_12345.parquet"); + assert!(result.is_err()); +} + +#[test] +fn test_arrow_roundtrip() { + let temp_path = "/tmp/test_arrow_roundtrip.parquet"; + let data = common::create_test_data(8); + let array = Float64Array::from(data.clone()); + + // Write Arrow array + write_arrow_to_parquet(temp_path, &array, None).unwrap(); + + // Read back as Arrow arrays (chunked) + let read_chunks = read_parquet_to_arrow(temp_path).unwrap(); + + // Verify total length + let total_len: usize = read_chunks.iter().map(|c| c.len()).sum(); + assert_eq!(array.len(), total_len); + + // Verify data integrity + let mut offset = 0; + for chunk in &read_chunks { + for i in 0..chunk.len() { + assert!((array.value(offset + i) - chunk.value(i)).abs() < 1e-10); + } + offset += chunk.len(); + } + + // Cleanup + fs::remove_file(temp_path).unwrap(); +} + +#[test] +fn test_write_empty_arrow_fails() { + let temp_path = "/tmp/test_empty_arrow.parquet"; + let array = Float64Array::from(Vec::<f64>::new()); + + let result = write_arrow_to_parquet(temp_path, &array, None); + assert!(result.is_err()); +} + +#[test] +fn test_large_dataset() { + let temp_path = "/tmp/test_large_dataset.parquet"; + let size = 1024; + let data: Vec<f64> = (0..size).map(|i| i as f64 / size as f64).collect(); + + // Write + write_parquet(temp_path, &data, None).unwrap(); + + // Read + let read_data = read_parquet(temp_path).unwrap(); + + // Verify size and sample values + assert_eq!(data.len(), read_data.len()); + assert!((data[0] - read_data[0]).abs() < 1e-10); + assert!((data[size - 1] - read_data[size - 1]).abs() < 1e-10); + + // Cleanup + fs::remove_file(temp_path).unwrap(); +} + +#[test] +fn test_chunked_read_api() { + let temp_path = "/tmp/test_chunked_api.parquet"; + let data = common::create_test_data(16); + + // Write test data + write_parquet(temp_path, &data, None).unwrap(); + let chunks = read_parquet_to_arrow(temp_path).unwrap(); + assert!(!chunks.is_empty()); + let total_len: usize = 
chunks.iter().map(|c| c.len()).sum(); + assert_eq!(total_len, data.len()); + for chunk in &chunks { + let buffer_ptr = chunk.values().as_ptr(); + assert!(!buffer_ptr.is_null()); + assert_eq!(buffer_ptr as usize % std::mem::align_of::<f64>(), 0); + + unsafe { + let slice = std::slice::from_raw_parts(buffer_ptr, chunk.len()); + for (i, &value) in slice.iter().enumerate() { + assert_eq!(value, chunk.value(i)); + } + } + } + + // Cleanup + fs::remove_file(temp_path).unwrap(); +}
