
Some adjustments for cache sizes for Apple Silicon
FrancescAlted committed Jan 4, 2025
1 parent 06350a0 commit 3616ded
Showing 3 changed files with 27 additions and 22 deletions.
bench/ndarray/cengine-expr.py (18 changes: 10 additions & 8 deletions)
@@ -3,16 +3,18 @@
 import numpy as np
 
 # Create some data operands
-N = 20_000 # working size of 6 GB
+N = 20_000 # working size of 3 GB
+dtype = "float32"
 chunks = (100, N)
 blocks = (1, N)
+chunks, blocks= None, None
 cparams = blosc2.CParams(clevel=1, codec=blosc2.Codec.LZ4)
 
 t0 = time()
-# a = blosc2.linspace(0, 1, N * N, dtype="float32", shape=(N, N), cparams=cparams)
-a = blosc2.linspace(0, 1, N * N, shape=(N, N), cparams=cparams, chunks=chunks, blocks=blocks)
-b = blosc2.linspace(1, 2, N * N, shape=(N, N), cparams=cparams, chunks=chunks, blocks=blocks)
-c = blosc2.linspace(-10, 10, N, cparams=cparams) # broadcasting is supported
+a = blosc2.linspace(0, 1, N * N, dtype=dtype, shape=(N, N), cparams=cparams, chunks=chunks, blocks=blocks)
+b = blosc2.linspace(1, 2, N * N, dtype=dtype, shape=(N, N), cparams=cparams, chunks=chunks, blocks=blocks)
+#c = blosc2.linspace(-10, 10, N, dtype=dtype, cparams=cparams) # broadcasting is supported
+c = blosc2.linspace(-10, 10, N * N, dtype=dtype, shape=(N, N), cparams=cparams)
 print("Time to create data: ", time() - t0)
 print("a.chunks, a.blocks, a.schunk.cratio: ", a.chunks, a.blocks, a.schunk.cratio)

@@ -21,13 +23,13 @@
 expr = ((a ** 3 + blosc2.sin(a * 2)) < c) & (b > 0)
 print(f"Time to create expression: {time() - t0:.5f}")
 
-# Evaluate while reducing (yep, reductions are in) along axis 1
+# Compute while reducing (yep, reductions are in) along axis 1
 t0 = time()
 out = expr[:]
 t1 = time() - t0
 print(f"Time to compute with Blosc2: {t1:.5f}")
 
-# Evaluate using NumPy operands
+# Compute using NumPy operands
 na, nb, nc = a[:], b[:], c[:]
 
 @blosc2.cengine
@@ -39,7 +41,7 @@ def compute_expression(na, nb, nc):
 t1 = time() - t0
 print(f"Time to compute with NumPy operands and Blosc2 engine: {t1:.5f}")
 
-# Evaluate using NumPy compute engine
+# Compute using NumPy compute engine
 t0 = time()
 nout = ((na ** 3 + np.sin(na * 2)) < nc) & (nb > 0)
 t2 = time() - t0
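For orientation, here is a condensed, hedged sketch of the pattern cengine-expr.py times, scaled down so it runs in seconds. It reuses only names that appear in the diff above (blosc2.linspace, blosc2.sin, blosc2.CParams, blosc2.Codec.LZ4); the value N = 2_000 is illustrative, and chunks/blocks are left unset so the cache-size heuristics tuned in src/blosc2/core.py below take effect.

import blosc2

# Scaled-down variant of the benchmark above; the real script uses N = 20_000.
N = 2_000
dtype = "float32"
cparams = blosc2.CParams(clevel=1, codec=blosc2.Codec.LZ4)

# chunks/blocks are not passed, so the library picks them from the CPU cache sizes.
a = blosc2.linspace(0, 1, N * N, dtype=dtype, shape=(N, N), cparams=cparams)
b = blosc2.linspace(1, 2, N * N, dtype=dtype, shape=(N, N), cparams=cparams)
c = blosc2.linspace(-10, 10, N * N, dtype=dtype, shape=(N, N), cparams=cparams)

# Building the expression is lazy; indexing with [:] triggers the computation.
expr = ((a ** 3 + blosc2.sin(a * 2)) < c) & (b > 0)
out = expr[:]

# Inspect the automatically chosen partitioning and the compression ratio.
print(a.chunks, a.blocks, a.schunk.cratio)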
bench/ndarray/cengine-reduc.py (18 changes: 10 additions & 8 deletions)
@@ -3,16 +3,18 @@
 import numpy as np
 
 # Create some data operands
-N = 20_000 # working size of 6 GB
+N = 20_000 # working size of 3 GB
+dtype = "float32"
 chunks = (100, N)
 blocks = (1, N)
+chunks, blocks= None, None
 cparams = blosc2.CParams(clevel=1, codec=blosc2.Codec.LZ4)
 
 t0 = time()
-# a = blosc2.linspace(0, 1, N * N, dtype="float32", shape=(N, N), cparams=cparams)
-a = blosc2.linspace(0, 1, N * N, shape=(N, N), cparams=cparams, chunks=chunks, blocks=blocks)
-b = blosc2.linspace(1, 2, N * N, shape=(N, N), cparams=cparams, chunks=chunks, blocks=blocks)
-c = blosc2.linspace(-10, 10, N, cparams=cparams) # broadcasting is supported
+a = blosc2.linspace(0, 1, N * N, dtype=dtype, shape=(N, N), cparams=cparams, chunks=chunks, blocks=blocks)
+b = blosc2.linspace(1, 2, N * N, dtype=dtype, shape=(N, N), cparams=cparams, chunks=chunks, blocks=blocks)
+c = blosc2.linspace(-10, 10, N, dtype=dtype, cparams=cparams) # broadcasting is supported
+#c = blosc2.linspace(-10, 10, N * N, dtype=dtype, shape=(N, N), cparams=cparams)
 print("Time to create data: ", time() - t0)
 print("a.chunks, a.blocks, a.schunk.cratio: ", a.chunks, a.blocks, a.schunk.cratio)

@@ -21,13 +23,13 @@
 expr = ((a ** 3 + blosc2.sin(a * 2)) < c) & (b > 0)
 print(f"Time to create expression: {time() - t0:.5f}")
 
-# Evaluate while reducing (yep, reductions are in) along axis 1
+# Compute while reducing (yep, reductions are in) along axis 1
 t0 = time()
 out = blosc2.sum(expr, axis=1) # , cparams=cparams)
 t1 = time() - t0
 print(f"Time to compute with Blosc2: {t1:.5f}")
 
-# Evaluate using NumPy operands
+# Compute using NumPy operands
 na, nb, nc = a[:], b[:], c[:]
 
 @blosc2.cengine
@@ -42,7 +44,7 @@ def compute_expression(na, nb, nc):
 def compute_expression_numpy(na, nb, nc):
     return np.sum(((na ** 3 + np.sin(na * 2)) < nc) & (nb > 0), axis=1)
 
-# Evaluate using NumPy compute engine
+# Compute using NumPy compute engine
 t0 = time()
 nout = compute_expression_numpy(na, nb, nc)
 t2 = time() - t0
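Similarly, a hedged sketch of what cengine-reduc.py exercises: the same lazy expression reduced along axis 1, once directly with blosc2.sum and once through the @blosc2.cengine decorator applied to plain NumPy code. Everything below comes from the hunks above except the scaled-down N and the assumption that the decorated compute_expression mirrors compute_expression_numpy.

import blosc2
import numpy as np

N = 2_000  # illustrative; the benchmark uses N = 20_000
dtype = "float32"
a = blosc2.linspace(0, 1, N * N, dtype=dtype, shape=(N, N))
b = blosc2.linspace(1, 2, N * N, dtype=dtype, shape=(N, N))
c = blosc2.linspace(-10, 10, N, dtype=dtype)  # 1-D operand; broadcasting is supported

# Reduction computed by Blosc2 directly on the lazy expression.
expr = ((a ** 3 + blosc2.sin(a * 2)) < c) & (b > 0)
out = blosc2.sum(expr, axis=1)

# Same computation on NumPy operands, routed through the Blosc2 compute engine.
na, nb, nc = a[:], b[:], c[:]

@blosc2.cengine
def compute_expression(na, nb, nc):
    return np.sum(((na ** 3 + np.sin(na * 2)) < nc) & (nb > 0), axis=1)

nout = compute_expression(na, nb, nc)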
src/blosc2/core.py (13 changes: 7 additions & 6 deletions)
@@ -1253,10 +1253,11 @@ def get_chunksize(blocksize, l3_minimum=2**20, l3_maximum=2**26):
     if isinstance(l2_cache_size, int) and l2_cache_size > chunksize:
         chunksize = l2_cache_size
 
-    # When evaluating expressions, it is convenient to keep chunks for all operands in L3 cache,
-    # so let's divide by 4 (3 operands + result is a typical situation for moderately complex
-    # expressions)
-    chunksize //= 4
+    # When computing expressions on Intel arch, it is convenient to keep chunks for all operands
+    # in L3 cache, so let's divide by 4 (3 operands + result is a typical situation for moderately
+    # complex expressions)
+    if platform.machine() == "x86_64":
+        chunksize //= 4
 
     # Ensure a minimum size
     if chunksize < l3_minimum:
@@ -1407,8 +1408,8 @@ def compute_chunks_blocks( # noqa: C901
         # For modern Intel/AMD archs, experiments say to use half of the L2 cache size
         max_blocksize = blosc2.cpu_info["l2_cache_size"] // 2
     elif platform.system() == "Darwin" and "arm" in platform.machine():
-        # For Apple Silicon, experiments say to use half of the L1 cache size
-        max_blocksize = blosc2.cpu_info["l1_data_cache_size"] // 2
+        # For Apple Silicon, experiments say we can use the full L1 data cache size
+        max_blocksize = blosc2.cpu_info["l1_data_cache_size"]
     if "clevel" in cparams and cparams["clevel"] == 0:
         # Experiments show that, when no compression is used, it is not a good idea
         # to exceed half of private cache for the blocksize because speed suffers
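To make the intent of the two core.py hunks concrete, here is a hedged, standalone sketch of the heuristics they tune. It is not the actual blosc2 implementation: the function names, the chunksize argument, and the final clamping are illustrative, and it assumes blosc2.cpu_info behaves like a dict with the keys used in the diff; only the x86_64 divide-by-4 rule and the Apple Silicon L1 / Intel L2 block-size rules come from the commit.

import platform

import blosc2


def sketch_chunksize(chunksize, l3_minimum=2**20, l3_maximum=2**26):
    # Grow the chunk to at least the L2 cache size, as in the hunk above.
    l2 = blosc2.cpu_info.get("l2_cache_size")
    if isinstance(l2, int) and l2 > chunksize:
        chunksize = l2
    # Only on Intel/AMD: leave room in L3 for 3 operands plus the result.
    if platform.machine() == "x86_64":
        chunksize //= 4
    # Keep the result within the configured bounds (the clamping is assumed).
    return min(max(chunksize, l3_minimum), l3_maximum)


def sketch_max_blocksize():
    if platform.system() == "Darwin" and "arm" in platform.machine():
        # Apple Silicon: the full L1 data cache works well per this commit.
        return blosc2.cpu_info["l1_data_cache_size"]
    # Modern Intel/AMD: half of the L2 cache.
    return blosc2.cpu_info["l2_cache_size"] // 2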
