From 39fec53ca1182049700806a423f60d44f2a9676d Mon Sep 17 00:00:00 2001 From: kosiew Date: Tue, 14 Jan 2025 20:01:10 +0800 Subject: [PATCH] fix: correct LZ0 to LZO in compression options (#995) * fix: correct LZ0 to LZO in compression options * fix: disable LZO compression option and update tests to reflect its unavailability * fix: ruff format expected string in test_execution_plan * fix: update test for execution plan and add validation for invalid LZO compression * fix: remove LZO compression option and related test cases * ruff autoformat * fix: remove TODO comment regarding LZO compression implementation --- python/datafusion/dataframe.py | 6 ++++-- python/tests/test_dataframe.py | 2 ++ python/tests/test_functions.py | 18 +++++++++--------- src/dataframe.rs | 2 +- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index f8aef0c9..b0c1abda 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -57,7 +57,9 @@ class Compression(Enum): GZIP = "gzip" BROTLI = "brotli" LZ4 = "lz4" - LZ0 = "lz0" + # lzo is not implemented yet + # https://github.com/apache/arrow-rs/issues/6970 + # LZO = "lzo" ZSTD = "zstd" LZ4_RAW = "lz4_raw" @@ -696,10 +698,10 @@ def write_parquet( - "snappy": Snappy compression. - "gzip": Gzip compression. - "brotli": Brotli compression. - - "lz0": LZ0 compression. - "lz4": LZ4 compression. - "lz4_raw": LZ4_RAW compression. - "zstd": Zstandard compression. + Note: LZO is not yet implemented in arrow-rs and is therefore excluded. compression_level: Compression level to use. For ZSTD, the recommended range is 1 to 22, with the default being 4. Higher levels provide better compression but slower speed. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index fa5f4e8c..a1a871e9 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1115,6 +1115,8 @@ def test_write_compressed_parquet_invalid_compression(df, tmp_path, compression) df.write_parquet(str(path), compression=compression) +# not testing lzo because it is not implemented yet +# https://github.com/apache/arrow-rs/issues/6970 @pytest.mark.parametrize("compression", ["zstd", "brotli", "gzip"]) def test_write_compressed_parquet_default_compression_level(df, tmp_path, compression): # Test write_parquet with zstd, brotli, gzip default compression level, diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 01c6c9ce..add170c1 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -790,9 +790,9 @@ def test_hash_functions(df): ) assert result.column(2) == pa.array( [ - b("185F8DB32271FE25F561A6FC938B2E26" "4306EC304EDA518007D1764826381969"), - b("78AE647DC5544D227130A0682A51E30B" "C7777FBB6D8A8F17007463A3ECD1D524"), - b("BB7208BC9B5D7C04F1236A82A0093A5E" "33F40423D5BA8D4266F7092C3BA43B62"), + b("185F8DB32271FE25F561A6FC938B2E264306EC304EDA518007D1764826381969"), + b("78AE647DC5544D227130A0682A51E30BC7777FBB6D8A8F17007463A3ECD1D524"), + b("BB7208BC9B5D7C04F1236A82A0093A5E33F40423D5BA8D4266F7092C3BA43B62"), ] ) assert result.column(3) == pa.array( @@ -838,16 +838,16 @@ def test_hash_functions(df): ) assert result.column(5) == pa.array( [ - b("F73A5FBF881F89B814871F46E26AD3FA" "37CB2921C5E8561618639015B3CCBB71"), - b("B792A0383FB9E7A189EC150686579532" "854E44B71AC394831DAED169BA85CCC5"), - b("27988A0E51812297C77A433F63523334" "6AEE29A829DCF4F46E0F58F402C6CFCB"), + b("F73A5FBF881F89B814871F46E26AD3FA37CB2921C5E8561618639015B3CCBB71"), + b("B792A0383FB9E7A189EC150686579532854E44B71AC394831DAED169BA85CCC5"), + b("27988A0E51812297C77A433F635233346AEE29A829DCF4F46E0F58F402C6CFCB"), ] ) assert result.column(6) 
== pa.array( [ - b("FBC2B0516EE8744D293B980779178A35" "08850FDCFE965985782C39601B65794F"), - b("BF73D18575A736E4037D45F9E316085B" "86C19BE6363DE6AA789E13DEAACC1C4E"), - b("C8D11B9F7237E4034ADBCD2005735F9B" "C4C597C75AD89F4492BEC8F77D15F7EB"), + b("FBC2B0516EE8744D293B980779178A3508850FDCFE965985782C39601B65794F"), + b("BF73D18575A736E4037D45F9E316085B86C19BE6363DE6AA789E13DEAACC1C4E"), + b("C8D11B9F7237E4034ADBCD2005735F9BC4C597C75AD89F4492BEC8F77D15F7EB"), ] ) assert result.column(7) == result.column(1) # SHA-224 diff --git a/src/dataframe.rs b/src/dataframe.rs index 71a6fe60..b875480a 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -491,7 +491,7 @@ impl PyDataFrame { ZstdLevel::try_new(verify_compression_level(compression_level)? as i32) .map_err(|e| PyValueError::new_err(format!("{e}")))?, ), - "lz0" => Compression::LZO, + "lzo" => Compression::LZO, "lz4" => Compression::LZ4, "lz4_raw" => Compression::LZ4_RAW, "uncompressed" => Compression::UNCOMPRESSED,