Francesc Alted, Aleix Alcacer and Christian Steiner. SciPy Conference 2021.
Caterva is an open source C and Python library, and an associated format, that implements a compressed, multidimensional container.
It differs from existing solutions like HDF5 or Zarr mainly in its two-level partitioning scheme and its type-agnostic containers, both of which are covered below.
To understand Caterva it is important to know some terms that are directly related to it:
Data compression is the process of encoding data in order to reduce its size. Caterva usually works with compressed datasets, allowing for a more contained use of space in memory or on disk.
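To get a feel for what compression buys us, here is a minimal sketch of a lossless round trip. It assumes the python-blosc package is available (Caterva itself builds on the C-Blosc2 library, but python-blosc is convenient for a quick demonstration):
import blosc
import numpy as np
# Highly regular data compresses very well
arr = np.linspace(0, 1, 1_000_000)
raw = arr.tobytes()
compressed = blosc.compress(raw, typesize=arr.itemsize)
print(len(raw) / len(compressed))  # compression ratio (well above 1 here)
# Decompressing restores the data exactly
restored = np.frombuffer(blosc.decompress(compressed), dtype=arr.dtype)
assert np.array_equal(arr, restored)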
Data chunking is a technique that consists of dividing a dataset into partitions of a specific size (chunks). In addition, Caterva algorithms implement a second level of partitioning to achieve better performance.
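As a toy illustration of the two levels of partitioning, the following sketch (plain NumPy arithmetic, not a Caterva API) computes how a dataset is carved into chunks, and each chunk into blocks, using the same parameters as the benchmarks later in this document:
import numpy as np
shape = (8_000, 8_000)   # the whole dataset
chunks = (4_000, 100)    # first-level partitions
blocks = (500, 25)       # second-level partitions, inside each chunk
chunks_per_dim = np.array(shape) // np.array(chunks)    # dims divide evenly here
blocks_per_chunk = np.array(chunks) // np.array(blocks)
print(chunks_per_dim)    # [ 2 80] -> 160 chunks in the dataset
print(blocks_per_chunk)  # [8 4]   -> 32 blocks inside every chunk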
Caterva is a C library for handling multi-dimensional, chunked, compressed datasets in an easy and fast way.
Caterva can be used in a great variety of scenarios; however, where it really stands out is with multidimensional data. Specifically, Caterva is really useful for extracting slices of compressed data because, thanks to the partitioning schema it implements, it minimizes the amount of data that has to be decompressed to obtain the slice, making things faster.
Accordingly, for cases where the slicing performance is important, Caterva turns out to be a good alternative to other solutions like Zarr or HDF5.
In addition, Caterva also introduces a new level of partitioning. Within each chunk, the data is repartitioned into smaller multidimensional sets called blocks.
In this way, Caterva can read blocks individually (and also in parallel) instead of whole chunks, which speeds up slice extraction by decompressing only those blocks that contain the slice.
In this section, we are going to extract some hyperplanes from chunked arrays created with Caterva, Zarr, and HDF5. We will also analyze the performance differences between these libraries and how double partitioning affects Caterva.
In these three libraries, the data is stored using Blosc chunks (which are internally split into blocks). However, while Zarr and HDF5 only introduce multidimensionality for chunks, Caterva introduces it for both chunks and blocks.
The chunks have also been optimized for extracting hyperslices along the second dimension.
Figure: partition layouts for slicing along the first dimension vs the second dimension.
Thanks to the second level of partitioning, Caterva will decompress less data than the other formats.
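A back-of-the-envelope calculation, using the same parameters as the benchmark below, gives a feel for the size of the effect. To read one hyperplane along the non-optimized first dimension, a chunk-only scheme has to decompress every chunk that the row crosses, whereas a chunk-plus-block scheme only needs the blocks that actually contain it:
shape, chunks, blocks, itemsize = (8_000, 8_000), (4_000, 100), (500, 25), 8
# A row [i, :] crosses shape[1] // chunks[1] = 80 chunks...
chunk_bytes = (shape[1] // chunks[1]) * chunks[0] * chunks[1] * itemsize
# ...but only shape[1] // blocks[1] = 320 blocks
block_bytes = (shape[1] // blocks[1]) * blocks[0] * blocks[1] * itemsize
print(chunk_bytes // 2**20, "MiB vs", block_bytes // 2**20, "MiB")  # 244 MiB vs 30 MiB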
import zarr
import caterva as cat
import numpy as np
import h5py as h5
import hdf5plugin as h5plugin
%load_ext memprofiler
First, the shape, chunks and blocks parameters are defined. As we can see, the second dimension of the chunks and blocks is smaller, and hence optimized for extracting hyperslices along it.
shape = (8_000, 8_000)
chunks = (4_000, 100)
blocks = (500, 25)
dtype = np.dtype("f8")
itemsize = dtype.itemsize
Then, a Caterva array, a Zarr array, and an HDF5 array are created from a NumPy array using the parameters defined before.
data = np.arange(np.prod(shape), dtype=dtype).reshape(shape)
c_data = cat.asarray(data, chunks=chunks, blocks=blocks)
z_data = zarr.array(data, chunks=chunks)
f = h5.File('hdf5_file.h5', 'w', driver="core")
f.create_dataset("data", shape, chunks=chunks, data=data, **h5plugin.Blosc())
h_data = f["data"]
Finally, some hyperplanes from the chunked arrays are extracted, and the performance is measured using the memprofiler plugin for Jupyter.
planes_dim0 = np.random.randint(0, shape[0], 100)
%%mprof_run -q caterva::dim0
for i in planes_dim0:
    block = c_data[i, :]
%%mprof_run -q zarr::dim0
for i in planes_dim0:
    block = z_data[i, :]
%%mprof_run -q hdf5::dim0
for i in planes_dim0:
    block = h_data[i, :]
planes_dim1 = np.random.randint(0, shape[1], 100)
%%mprof_run -q caterva::dim1
for i in planes_dim1:
    block = c_data[:, i]
%%mprof_run -q zarr::dim1
for i in planes_dim1:
    block = z_data[:, i]
%%mprof_run -q hdf5::dim1
for i in planes_dim1:
    block = h_data[:, i]
f.close()
%mprof_barplot --title "Getting data (lower is better)" --variable time --groupby 1 .*
As we can see in the graph, the slicing times are similar in the optimized dimension. However, Caterva performs better (by far) in the non-optimized dimension. This is because, with double partitioning, you only have to decompress the blocks containing the slice instead of the whole chunk.
Now, we are going to update some hyperplanes from chunked arrays created with Caterva, Zarr, and HDF5. As before, we will also analyze the performance differences between these libraries and how double partitioning affects Caterva.
First, let's define some necessary parameters.
shape = (8_000, 8_000)
chunks = (4_000, 100)
blocks = (500, 25)
dtype = np.dtype("f8")
itemsize = dtype.itemsize
Then, an empty array for each library is created with the previous parameters.
c_data = cat.empty(shape, itemsize, chunks=chunks, blocks=blocks)
z_data = zarr.empty(shape, dtype=dtype, chunks=chunks)
f = h5.File('hdf5_file.h5', 'w', driver="core")
f.create_dataset("data", shape, chunks=chunks, **h5plugin.Blosc())
h_data = f["data"]
Finally, some hyperplanes from the chunked arrays are updated and the performance is measured as in the previous section.
planes_dim0 = np.random.randint(0, shape[0], 100)
block_dim0 = np.arange(shape[1], dtype=dtype)  # a row [i, :] has shape[1] elements
%%mprof_run -q caterva::dim0
for i in planes_dim0:
    c_data[i, :] = block_dim0
%%mprof_run -q zarr::dim0
for i in planes_dim0:
    z_data[i, :] = block_dim0
%%mprof_run -q hdf5::dim0
for i in planes_dim0:
    h_data[i, :] = block_dim0
planes_dim1 = np.random.randint(0, shape[1], 100)
block_dim1 = np.arange(shape[0], dtype=dtype)  # a column [:, i] has shape[0] elements
%%mprof_run -q caterva::dim1
for i in planes_dim1:
    c_data[:, i] = block_dim1
%%mprof_run -q zarr::dim1
for i in planes_dim1:
    z_data[:, i] = block_dim1
%%mprof_run -q hdf5::dim1
for i in planes_dim1:
    h_data[:, i] = block_dim1
f.close()
%mprof_barplot --title "Setting data (lower is better)" --variable time --groupby 1 .*
In this case, the performance is also similar in the optimized dimension. However, there are differences in the non-optimized dimension. This is because, while Zarr and HDF5 only have to reorganize the data into chunks, Caterva has more work to do: as explained before, it also has to perform a second reorganization of the data due to the additional repartitioning into blocks.
Caterva stores only the item size of the data, not its data type. This keeps the container format simple and type-agnostic: interpreting the data is delegated to higher-level layers (for example, ironArray uses a metalayer to add floating-point types on top of Caterva, as described later).
Despite not providing a specific data type, Caterva supports both the buffer and array protocols. Let's see how this works.
import caterva as cat
import numpy as np
shape = (1_000, 1_000)
chunks = (500, 20)
blocks = (200, 10)
dtype = np.dtype("f8")
itemsize = dtype.itemsize
a = cat.empty(shape, itemsize, chunks=chunks, blocks=blocks)
for i in range(shape[0]):
    a[i] = np.linspace(0, 1, shape[1], dtype=dtype)
When a slice is extracted from Caterva, the result is still another Caterva array. However, this new array is not based on Blosc but on a simple buffer.
b = a[5:7, 5:10]
b.info
Type | NDArray (Plainbuffer) |
---|---|
Itemsize | 8 |
Shape | (2, 5) |
In this way, the protocols mentioned above can be used to work with slices of Caterva arrays from other libraries.
But what happens if we create a NumPy array from a Caterva array based on a simple buffer?
c = np.asarray(b)
c
array([[b'\x02H\x01\xcd \x80t?', b'\x9c\x89\x01\xf6\xc0\x99x?', b'6\xcb\x01\x1fa\xb3|?', b'h\x06\x01\xa4\x80f\x80?', b"5'\x81\xb8Ps\x82?"], [b'\x02H\x01\xcd \x80t?', b'\x9c\x89\x01\xf6\xc0\x99x?', b'6\xcb\x01\x1fa\xb3|?', b'h\x06\x01\xa4\x80f\x80?', b"5'\x81\xb8Ps\x82?"]], dtype='|S8')
So, in this case, the inferred data type is a byte string. Caterva internally assigns this data type because one is needed to implement the protocols.
In order to obtain the original array, the data has to be cast to the right type. To do this cast in NumPy, the ndarray.view(dtype) method can be used. In this case, we want to view the NumPy array as an array of doubles.
c = np.asarray(b).view(dtype)
c
array([[0.00500501, 0.00600601, 0.00700701, 0.00800801, 0.00900901], [0.00500501, 0.00600601, 0.00700701, 0.00800801, 0.00900901]])
Finally, here is what happens when some elements of the Caterva array are updated.
b[0] = np.arange(5, dtype=dtype)
c
array([[0. , 1. , 2. , 3. , 4. ], [0.00500501, 0.00600601, 0.00700701, 0.00800801, 0.00900901]])
As can be seen, the updates also appear in the NumPy array. That is because the data buffer is shared between the Caterva array and the NumPy array. Therefore, supporting the buffer and array protocols allows Caterva to share data with other libraries without copies.
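We can double-check that no copy was made with np.shares_memory, a NumPy utility that detects whether two arrays overlap in memory:
# True: the NumPy view and the Caterva slice wrap the same buffer
np.shares_memory(c, np.asarray(b))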
Metalayers are small pieces of metadata that describe the kind of data stored in a Blosc2 container. Caterva specifies its own metalayer for storing multidimensional information; this metalayer can be modified so that, for example, the shapes can be updated.
In general, you can use metalayers for adapting Blosc2 containers (and in particular, Caterva arrays) to your own needs.
First, we define the shape and the chunks and blocks for the array. Then, we create an array with one metalayer storing a date.
import caterva as cat
from struct import pack
urlpath = "arr_with_meta.caterva"
shape = (1_000, 1_000)
chunks = (500, 500)
blocks = (10, 250)
meta = {
    b"date": b"01/01/2021"
}
a = cat.full(shape, fill_value=pack("f", 3.14), chunks=chunks, blocks=blocks, meta=meta,
urlpath=urlpath)
a = cat.open(urlpath)
Then we get the names of all the metalayers in the array:
a.meta.keys()
['caterva', 'date']
With that, we can get the information stored in e.g. the date metalayer:
assert a.meta.get("date") == a.meta["date"]
a.meta["date"]
b'01/01/2021'
Now, let's update the content of the date metalayer. It is important to remember that the length of a Caterva metalayer cannot change, so you must be careful when updating it.
a.meta["date"] = b"08/01/2021"
try:
    a.meta["date"] = b"8/1/2021"
except ValueError as err:
    print(err)
The length of the content in a metalayer cannot change.
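If you need content whose length may vary, a simple workaround (our own sketch, not a Caterva feature) is to reserve a fixed number of bytes and pad the actual payload. Here, fixed_width is a hypothetical helper, and the width matches the 10 bytes of the original date:
META_WIDTH = 10  # length of the original b"01/01/2021" payload

def fixed_width(payload: bytes, width: int = META_WIDTH) -> bytes:
    # Pad (or truncate) so the stored length never changes
    return payload[:width].ljust(width, b"\x00")

a.meta["date"] = fixed_width(b"8/1/2021")  # now exactly 10 bytes, so the update succeeds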
Finally, you should know that Caterva introduces a metalayer in the array storing the multidimensional information. You can inspect this metalayer easily:
import msgpack
caterva_meta = msgpack.unpackb(a.meta.get("caterva"))
print(f"Format version: {caterva_meta[0]}")
print(f"N. dimensions: {caterva_meta[1]}")
print(f"Shape: {caterva_meta[2]}")
print(f"Chunks: {caterva_meta[3]}")
print(f"Blocks: {caterva_meta[4]}")
cat.remove(urlpath)
Format version: 0
N. dimensions: 2
Shape: [1000, 1000]
Chunks: [500, 500]
Blocks: [10, 250]
ironArray is a library built on top of Caterva. It is a powerful, flexible and fast toolkit for managing and computing with floating-point datasets.
Its highlights include fast computations on compressed data and automatic tuning of compression parameters, as described below.
For more information about ironArray, see: https://ironarray.io
In order to better grasp what compression can bring to high performance computing, and in particular how it can contribute to breaking the memory wall, let's look at an example of computation with actual data (coming from a precipitation dataset): computing the mean of three datasets with ironArray (ia) and with NumPy (np).
ironArray will use artificial intelligence algorithms to achieve outstanding execution times. Choose between speed, compression ratio, or a balance between the two (the default), and ironArray will decide the best parameters for completing the computation.
ironArray Community is the open source version of ironArray. It has been developed to mimic the API of h5py and Zarr, and it implements support for single- and double-precision floating-point data using a metalayer. With the Community edition of ironArray, you can extract slices from floating-point datasets in a simple way!
For more information about ironArray Community, see: https://ironarray.io/products
import iarray_community as ia
import numpy as np
shape = (1_000, 1_000)
chunks = (500, 500)
blocks = (100, 100)
dtype = np.float64
data = ia.zeros(shape, dtype=dtype, chunks=chunks, blocks=blocks, codec=ia.Codecs.LZ4)
data.info
type | IArray |
---|---|
shape | (1000, 1000) |
chunks | (500, 500) |
blocks | (100, 100) |
cratio | 31250.00 |
data[0] = np.linspace(0, 1, shape[1], dtype=dtype)
s = data[0, 250:-740]
type(s)
numpy.ndarray
s
array([0.25025025, 0.25125125, 0.25225225, 0.25325325, 0.25425425, 0.25525526, 0.25625626, 0.25725726, 0.25825826, 0.25925926])
Resize array dimensions.
Improve slicing capabilities: currently Caterva only supports basic slicing based on start:stop ranges; we would like to extend this to start:stop:step, as well as selections based on an array of booleans (similar to NumPy; see the sketch after this list).
Support for variable-length metalayers: this would provide users a lot of flexibility to define their own metadata.
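For reference, here is the NumPy behavior that the slicing roadmap item aims for (a plain NumPy example, nothing Caterva-specific):
import numpy as np

arr = np.arange(10)
print(arr[1:9:2])         # start:stop:step slicing -> [1 3 5 7]
print(arr[arr % 3 == 0])  # boolean selection       -> [0 3 6 9]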
The Blosc team would like to thank our sponsors:
Huawei for making a generous donation that allowed us to get started with Caterva.
The NumFOCUS foundation for several small development grants (SDG).
ironArray SL for making a donation to finish outlining Caterva.
Last but not least, thanks to the SciPy Conference for enabling the opportunity to introduce Caterva to the community.