From c761555604352902c2548b4b42715c8800fab5b6 Mon Sep 17 00:00:00 2001 From: Michael Innerberger Date: Fri, 26 May 2023 14:09:45 -0400 Subject: [PATCH 01/35] First attempt at draft alternative proposal for the table spec --- latest/index.bs | 338 +++++++++++------------------------------------- 1 file changed, 73 insertions(+), 265 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index ed8b6aa4..2c1bd3f8 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -178,305 +178,113 @@ For this example we assume an image with 5 dimensions and axes called `t,c,z,y,x Tables {#table-layout} ---------------------- The following describes the expected layout for tabular data. -OME-NGFF tables are compatible with the [AnnData model](https://github.com/scverse/anndata). +
-.                             # Root folder, potentially in S3,
-│                             # with a flat list of images.
-│
-└── 123.zarr
-    |
-    ├── .zgroup
-    |
-    ├── .zattrs
+    .                             # Root folder, potentially in S3,
+    |                             # with a flat list of images.
     |
-    └── tables                # The tables group is a container which holds one or multiple tables that are compatible with AnnData.
+    └── 123.zarr
         |
-        │                     # The tables group MAY be in the root of the zarr file.
-        ├── .zgroup           # The tables group MAY be in root or in another group.
+        ├── .zgroup
         |
-        ├── .zattrs           # `.zattrs` MUST contain "tables", which lists the keys of the subgroups that are tables. In this case, the only table is "my_table".
-                              # hence `.zattrs` should be equal to `{ "tables": [ "my_table" ] }`.
+        ├── .zattrs
         |
-        └── my_table
-            │                     # The table group MAY be in the root of the zarr file.
-            ├── .zgroup           # The table group MAY be in root or in another group.
-            |
-            ├── .zattrs           # `.zattrs` MUST contain "type", which is set to `"ngff:region_table"`
-            |                     # `.zattrs` MUST contain "region", which is the path to the data the table is annotating.
-            |                     # "region" MUST be a single path (single region) or an array of paths (multiple regions).
-            |                     # "region" paths MUST be objects with a key "path" and the path value MUST be a string.
-            |                     # `.zattrs` MUST contain "region_key" if "region" is an array. "region_key" is the key in `obs` denoting which region a given row corresponds to.
-            |                     # `.zattrs` MAY contain "instance_key", which is the key in `obs` that denotes which instance in "region" the row corresponds to. If "instance_key" is not provided, the values from the `obs` `.zattrs` "_index" key is used.
-            │
-            ├── X                 # You MAY add an zarr array `X`.
-            │   │                 # `X` MUST not be a complex type (i.e., MUST be a single type)
-            │   │                 # `X` MAY be chunked as the user desires.
-            │   ├── .zarray
-            │   ├── 0.0
-            │   │   ...
-            │   └── n.m
-            |
-            ├── layers            # You MAY add a `layers` group, which contains dense matrices with the same shape as X.
-            │   │
-            │   ├── .zgroup
-            │   ├── .zattrs       # `.zattrs` MUST contain `"keys"`, which is an array of the names of the subgroups containing a `layer`.
-            │   │
-            │   └── layer_0       # You MAY add a zarr array for each layer
-            |       |             # Each layer array MUST have the same shape as X
-            |       |             # Each layer array SHOULD be chunked the same as X
-            |       ├── .zarray
-            |       |
-            |       ├── 0.0
-            │       │   ...
-            │       └── n.m
-            │        
-            ├── obs               # You MUST add an obs group container. The obs group holds a table of annotations on the rows in X.
-            │   │                 # The rows in obs MUST be index-matched to the rows in X.
-            │   ├── .zgroup
-            │   │                     
-            │   ├── .zattrs       # `.zattrs` MUST contain `"_index"`, which is the name of the column in obs to be used as the index.           
-            │   │                 # `.zattrs` MUST contain `"column-order"`, which is a list of the order of the non-_index columns.
-            │   │                 # `.zattrs` MUST contain `"encoding-type"`, which is set to `"dataframe"` by AnnData.
-            │   │                 # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.2.0"` by AnnData.
-            │   │      
-            │   └── col_0         # Each column in the obs table is a 1D zarr array. The rows can be chunked as the user desires.
-            │       ├── .zarray   # However, the obs columns SHOULD be chunked in the same way as the rows in X (if present).
-            │       │
-            │       └─ 0
-            ├── var               # You MAY add a var group container. The var group holds a table of annotations on the columns in X.
-            |   │                 # The rows in var MUST be index-matched to the columns in X (if present). 
-            |   |
-            |   ├── .zattrs       # `.zattrs` MUST contain `"_index"`, which is the name of the column in obs to be used as the index.           
-            |   │                 # `.zattrs` MUST contain `"column-order"`, which is a list of the order of the non-_index columns.
-            |   │                 # `.zattrs` MUST contain `"encoding-type"`, which is set to `"dataframe"` by AnnData.
-            |   │                 # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.2.0"` by AnnData.
-            |   │
-            |   ├── array_col     # Columns in the var table MAY be a 1D zarr array. The rows can be chunked as the user desires.
-            |   |   ├── .zarray   # However, the var columns SHOULD be chunked in the same way as the columns in X.
-            |   |   │
-            |   |   └─ 0
-            |   |
-            |   └── cat_col       # Columns in the var table MAY be categorical
-            |       ├── .zattrs.  # `.zattrs` MUST contain `"encoding-type"`, which is set to `"categorical"` by AnnData.
-            |       |             # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.2.0"` by AnnData.
-            |       |
-            |       ├── categories
-            |       |  ├── .zarray # categories MUST be a 1D zarr array. The rows can be chunked as the user desires.
-            |       |  |
-            |       |  └─ 0
-            |       ├── codes
-            |       |  ├── .zarray # codes MUST be a 1D zarr array. The rows can be chunked as the user desires.
-            |       |  |
-            |       |  └─ 0
-            |       |
-            |       ├── null_col  # Columns in the var table MAY nullable integer
-            |       ├── .zattrs.  # `.zattrs` MUST contain `"encoding-type"`, which is set to `"nullable-integer"` by AnnData.
-            |       |             # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-            |       |
-            |       ├── mask
-            |       |  ├── .zarray # categories MUST be a 1D zarr array. The rows can be chunked as the user desires.
-            |       |  |
-            |       |  └─ 0
-            |       └── values
-            |          ├── .zarray # codes MUST be a 1D zarr array. The rows can be chunked as the user desires.
-            |          |
-            |          └─ 0
-            |
-            ├── obsm              # You MAY add a obsm group comtainer. The obsm group contains arrays that annotate the rows in X.
-            |   │                 # The rows in each array MUST be index-matched to the rows in X (if present). 
-            |   |
-            │   ├── .zgroup
-            |   |
-            |   ├── .zattrs       # `.zattrs` MUST contain `"encoding-type"`, which is set to `"dict"` by AnnData.           
-            |   │                 # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-            |   |                 # `.zattrs` MUST contain `"keys"`, which is an array of the names of the subgroups containing `obsm` arrays.
-            |   │
-            │   └── obsm_0        # You MAY add a zarr array for each obsm matrix.
-            |       |             # Each obsm array MUST have the same number of rows as X.
-            |       |             # The rows in each obsm array SHOULD be chunked the same as the rows in X.
-            |       ├── .zarray
-            |       |
-            |       ├── 0.0
-            │       │   ...
-            │       └── n.m
+        └── my_table              # A table group is a container which holds all files associated with one table
             |
-            ├── varm              # You MAY add a varm group comtainer. The varm group contains arrays that annotate the columns in X.
-            |   │                 # The rows in each array MUST be index-matched to the columns in X (if present). 
-            |   |
-            │   ├── .zgroup
-            |   |
-            |   ├── .zattrs       # `.zattrs` MUST contain `"encoding-type"`, which is set to `"dict"` by AnnData.           
-            |   │                 # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-            |   |                 # `.zattrs` MUST contain `"keys"`, which is an array of the names of the subgroups containing `varm` arrays.
-            |   │
-            │   └── varm_0        # You MAY add a zarr array for each varm matrix.
-            |       |             # Each varm array MUST have the same number of rows as columns in X.
-            |       |             # The rows in each obsm array SHOULD be chunked the same as the columns in X.
-            |       ├── .zarray
-            |       ├── 0.0
-            │       │   ...
-            │       └── n.m
+            |                     # The table group MAY be in the root of the zarr file.
+            ├── .zgroup           # The table group MAY be in root or in another group.
             |
-            ├── obsp              # You MAY add a obsp group comtainer. The obsp group contains sparse arrays that annotate the rows in X.
-            |   │                 # The rows in each array MUST be index-matched to the columns in X (if present). 
-            |   |
-            │   ├── .zgroup
-            |   |
-            |   ├── .zattrs       # `.zattrs` MUST contain `"encoding-type"`, which is set to `"dict"` by AnnData.           
-            |   │                 # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-            |   |                 # `.zattrs` MUST contain `"keys"`, which is an array of the names of the subgroups containing `obsp` arrays.
-            |   │
-            │   └── obsp_0        # You MAY add a zarr group for each obsp array.
-            |       |             # Each obsp array MUST have the same number of rows as rows in X.
-            |       |
-            │       ├── .zgroup
-            |       |
-            |       ├── .zattrs   # `.zattrs` MUST contain `"encoding-type"`, which is set to `"csr_matrix"` or `"csc_matrix"` for compressed sparse row and compressed sparse column, respectively.          
-            |       │             # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-            |       |             # `.zattrs` MUST contain `"shape"` which is an array giving the shape of the densified array.
-            |       |
-            |       ├── data      # You MUST add a one-dimensional zarr array named "data". 
-            |       |   |         # `data` MAY be chunked as the user desires.
-            |       |   ├── .zarray
-            |       |   |
-            |       |   ├── 0
-            │       │   |   ...
-            │       |   └── n
-            |       |
-            |       ├── indices   # You MUST add a one-dimensional zarr array named "indices".
-            |       |   |         # `indices` MAY be chunked as the user desires.
-            |       |   ├── .zarray  # `indices` MUST be an `int` dtype.
-            |       |   |
-            |       |   ├── 0
-            │       │   |   ...
-            │       |   └── n
-            |       |
-            |       └── indptr    # You MUST add a one-dimensional zarr array named "indptr".
-            |           |         # `indptr` MAY be chunked as the user desires.
-            |           ├── .zarray  # `indptr` MUST be an `int` dtype.
-            |           |
-            |           ├── 0
-            │           |   ...
-            │           └── n
+            ├── .zattrs           # `.zattrs` MUST contain "type", which is set to `"ngff:table"`
+            |                     # `.zattrs` MUST contain "annotated_data", which is the path to the data the table is annotating.
+            |                     # "annotated_data" MUST be a single path (single piece of data) or an array of paths (multiple data), which MUST be strings.
             |
-            ├── varp              # You MAY add a varp group comtainer. The varp group contains sparse arrays that annotate the columns in X.
-            |   │                 # The rows in each array MUST be index-matched to the columns in X (if present). 
-            |   |
-            │   ├── .zgroup
-            |   |
-            |   ├── .zattrs       # `.zattrs` MUST contain `"encoding-type"`, which is set to `"dict"` by AnnData.           
-            |   │                 # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-            |   |                 # `.zattrs` MUST contain `"keys"`, which is an array of the names of the subgroups containing `varp` arrays.
-            |   │
-            │   └── varp_0        # You MAY add a zarr group for each varp array.
-            |       |             # Each varp array MUST have the same number of rows as columns in X.
-            |       |
-            │       ├── .zgroup
-            |       |
-            |       ├── .zattrs   # `.zattrs` MUST contain `"encoding-type"`, which is set to `"csr_matrix"` or `"csc_matrix"` for compressed sparse row and compressed sparse column, respectively.           
-            |       │             # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-            |       |             # `.zattrs` MUST contain `"shape"` which is an array giving the shape of the densified array.
-            |       |
-            |       ├── data      # You MUST add a one-dimensional zarr array named "data". 
-            |       |   |         # `data` MAY be chunked as the user desires.
-            |       |   ├── .zarray
-            |       |   |
-            |       |   ├── 0
-            │       │   |   ...
-            │       |   └── n
-            |       |
-            |       ├── indices   # You MUST add a one-dimensional zarr array named "indices".
-            |       |   |         # `indices` MAY be chunked as the user desires.
-            |       |   ├── .zarray  # `indices` MUST be an `int` dtype.
-            |       |   |
-            |       |   ├── 0
-            │       │   |   ...
-            │       |   └── n
-            |       |
-            |       └── indptr    # You MUST add a one-dimensional zarr array named "indptr".
-            |           |         # `indptr` MAY be chunked as the user desires.
-            |           ├── .zarray  # `indptr` MUST be an `int` dtype.
-            |           |
-            |           ├── 0
-            │           |   ...
-            │           └── n
+            └── table_data        # The table group MUST contain a zarr array `table_data`.
+            |   |                 # `table_data` MUST not be a complex type (i.e., MUST be a single type)
+            |   |                 # `table_data` MAY be chunked as the user desires.
+            |   ├── .zarray
+            |   ├── 0.0
+            |   |   ...
+            |   └── n.m
             |
-            └── uns               # You MAY add a uns containter to store unstructured data.
+            └── axis_annotation_0 # The table MAY contain one ore more `axis_annotation_#` group containers.
+                |                 # These groups hold datasets and groups that represent annotations on one dimension of `table_data`.
+                |                 # The columns of all annotations in `axis_annotation_#` MUST be index-matched to the corresponding dimension of `table_data`.
+                |                 # The columns of all annotations in `axis_annotation_#` SHOULD be chunked in the same way as the corresponding dimension of `table_data`.
                 |
-                ├── .zgroup
+                ├── .zattrs       # `.zattrs` MUST contain `"axis"`, which is the name of the axis in `table_data` the group is annotating.
+                |                 # `.zattrs` MUST contain `"column-order"`, which is a list of the order of the annotations.
                 |
-                ├── .zattrs       # `.zattrs` MUST contain `"encoding-type"`, which is set to `"dict"` by AnnData.           
-                │                 # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-                │
-                ├── group         # You MAY add zarr groups.
-                |   |             # `uns` groups MAY contain groups, dataframes, dense arrays, and sparse arrays.
-                |   |
-                |   ├── .zgroup
+                ├── dense_ann     # Annotations in the `axis_annotation_#` group MAY be a 1D or 2D zarr array. The rows can be chunked as the user desires.
+                |   ├── .zarray
+                |   ├── .zattrs   # If the array is 2D, `.zattrs` MAY contain a list of strings `"column_names"` that contains the headers of the columns of the array.
                 |   |
-                |   ├── .zattrs   # `.zattrs` MUST contain `"encoding-type"`, which is set to `"csr_matrix"` by AnnData.           
-                |   │             # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-                |   ...
+                |   ├── 0.0
+                |   |   ...
+                |   └── n.m
                 |
-                ├── dataframe_0   # You MAY add dataframe group containers.
-                |   |                 # dataframes MAY be in the `uns` group or in a subgroup.
-                |   │
-                |   ├── .zgroup
-                |   │                     
-                |   ├── .zattrs       # `.zattrs` MUST contain `"_index"`, which is the name of the column in obs to be used as the index.           
-                |   │                 # `.zattrs` MUST contain `"column-order"`, which is a list of the order of the non-_index columns.
-                |   │                 # `.zattrs` MUST contain `"encoding-type"`, which is set to `"dataframe"` by AnnData.
-                |   │                 # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.2.0"` by AnnData.
-                |   │      
-                |   └── col_0         # Each column in the obs table is a 1D zarr array.
-                |       ├── .zarray   # Each columns MUST be chunked the same, but the chunking may be chosen by the user.
-                |       │
-                |       └─ 0
+                ├── cat_ann       # Annotations in the `axis_annotation_#` group MAY be categorical.
+                |   ├── .zattrs.  # `.zattrs` MUST contain `"encoding-type"`, which is set to `"categorical"`.
+                |   |
+                |   ├── categories
+                |   |   ├── .zarray   # `categories` MUST be a 1D zarr array of strings.
+                |   |   |
+                |   |   ├── 0
+                |   |   |   ...
+                |   |   └── n
+                |   └── codes
+                |       ├── .zarray   # `codes` MUST be a 1D zarr array of 0-based indices into the `categories` array.
+                |       |
+                |       ├── 0
+                |       |   ...
+                |       └── n
                 |
-                ├── dense_array       # You MAY dense arrays as n n-dimensional zarr arrays.
-                |   │                 # `dense_array` MUST not be a complex type (i.e., MUST be a single type)
-                |   │                 # `dense_array` MAY be chunked as the user desires.
-                |   |                 # `dense array` MAY be in the `uns` group or in a subgroup.
+                ├── null_ann      # Annotations in the `axis_annotation_#` group MAY be nullable integers.
+                |   ├── .zattrs.  # `.zattrs` MUST contain `"encoding-type"`, which is set to `"nullable-integer"`.
                 |   |
-                |   ├── .zarray
-                |   ├── 0.0
-                |   │   ...
-                |   └── n.m
+                |   ├── values
+                |   |   ├── .zarray   # `values` MUST be a 1D zarr array of integers.
+                |   |   |
+                |   |   ├── 0
+                |   |   |   ...
+                |   |   └── n
+                |   └── mask
+                |       ├── .zarray   # `mask` MUST be a 1D boolean zarr array of the same size as `values`.
+                |       |
+                |       ├── 0
+                |       |   ...
+                |       └── n
                 |
-                └── sparse_array  # You MAY add sparse arrays as a zarr group for each sparse array.
-                    |             # sparse arrays MAY be in the `uns` group or in a subgroup.
+                └── sparse_ann    # Annotations in the `axis_annotation_#` group MAY be sparse arrays.
                     |
                     ├── .zgroup
                     |
-                    ├── .zattrs      # `.zattrs` MUST contain `"encoding-type"`, which is set to `"csr_matrix"` or `"csc_matrix"` for compressed sparse row and compressed sparse column, respectively.           
-                    │                # `.zattrs` MUST contain `"encoding-version"`, which is set to `"0.1.0"` by AnnData.
-                    |                # `.zattrs` MUST contain `"shape"` which is an array giving the shape of the densified array.
+                    ├── .zattrs   # `.zattrs` MUST contain `"encoding-type"`, which is set to `"csr_matrix"` or `"csc_matrix"` for compressed sparse row and compressed sparse column, respectively.
+                    |             # `.zattrs` MUST contain `"shape"` which is an array giving the shape of the densified array.
                     |
-                    ├── data         # You MUST add a one-dimensional zarr array named "data". 
-                    |   |            # `data` MAY be chunked as the user desires.
+                    ├── data      # The sparse group MUST contain a one-dimensional zarr array `data`.
+                    |   |         # `data` MAY be chunked as the user desires.
                     |   ├── .zarray
                     |   |
                     |   ├── 0
-                    │   |   ...
+                    |   |   ...
                     |   └── n
                     |
-                    ├── indices      # You MUST add a one-dimensional zarr array named "indices".
-                    |   |            # `indices` MAY be chunked as the user desires.
-                    |   ├── .zarray  # `indices` MUST be an `int` dtype.
+                    ├── indices   # The sparse group MUST contain a one-dimensional zarr array `indices`.
+                    |   |         # `indices` MAY be chunked as the user desires but SHOULD be chunked in the same way as `data`.
+                    |   ├── .zarray   # `indices` MUST be an `int` dtype.
                     |   |
                     |   ├── 0
-                    │   |   ...
+                    |   |   ...
                     |   └── n
                     |
-                    └── indptr       # You MUST add a one-dimensional zarr array named "indptr".
-                        |            # `indptr` MAY be chunked as the user desires.
-                        ├── .zarray  # `indptr` MUST be an `int` dtype.
+                    └── indptr    # The sparse group MUST contain a one-dimensional zarr array `indptr`.
+                        |         # `indptr` MAY be chunked as the user desires but SHOULD be chunked in the same way as `data`.
+                        ├── .zarray   # `indptr` MUST be an `int` dtype.
                         |
                         ├── 0
                         |   ...
                         └── n
-        
-
 
High-content screening {#hcs-layout} From 27d1d2cd272a7652d755dad4f757b4be1883290f Mon Sep 17 00:00:00 2001 From: Michael Innerberger Date: Fri, 26 May 2023 14:37:07 -0400 Subject: [PATCH 02/35] Fix a few things after reviewing the spec --- latest/index.bs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index 2c1bd3f8..25ece648 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -196,8 +196,8 @@ The following describes the expected layout for tabular data. ├── .zgroup # The table group MAY be in root or in another group. | ├── .zattrs # `.zattrs` MUST contain "type", which is set to `"ngff:table"` - | # `.zattrs` MUST contain "annotated_data", which is the path to the data the table is annotating. - | # "annotated_data" MUST be a single path (single piece of data) or an array of paths (multiple data), which MUST be strings. + | # `.zattrs` MAY contain "annotated_data", which is the path to the data the table is annotating. + | # If present, "annotated_data" MUST be a single path (single piece of data) or an array of paths (multiple data), which MUST be strings. | └── table_data # The table group MUST contain a zarr array `table_data`. | | # `table_data` MUST not be a complex type (i.e., MUST be a single type) @@ -207,15 +207,15 @@ The following describes the expected layout for tabular data. | | ... | └── n.m | - └── axis_annotation_0 # The table MAY contain one ore more `axis_annotation_#` group containers. + └── axis_annotation_0 # The table MAY contain one ore more `axis_annotation_N` group containers. | # These groups hold datasets and groups that represent annotations on one dimension of `table_data`. - | # The columns of all annotations in `axis_annotation_#` MUST be index-matched to the corresponding dimension of `table_data`. - | # The columns of all annotations in `axis_annotation_#` SHOULD be chunked in the same way as the corresponding dimension of `table_data`. 
+ | # The columns of all annotations in `axis_annotation_N` MUST be index-matched to the corresponding dimension of `table_data`. + | # The columns of all annotations in `axis_annotation_N` SHOULD be chunked in the same way as the corresponding dimension of `table_data`. | ├── .zattrs # `.zattrs` MUST contain `"axis"`, which is the name of the axis in `table_data` the group is annotating. | # `.zattrs` MUST contain `"column-order"`, which is a list of the order of the annotations. | - ├── dense_ann # Annotations in the `axis_annotation_#` group MAY be a 1D or 2D zarr array. The rows can be chunked as the user desires. + ├── dense_ann # Annotations in the `axis_annotation_N` group MAY be a 1D or 2D zarr array. The rows can be chunked as the user desires. | ├── .zarray | ├── .zattrs # If the array is 2D, `.zattrs` MAY contain a list of strings `"column_names"` that contains the headers of the columns of the array. | | @@ -223,7 +223,7 @@ The following describes the expected layout for tabular data. | | ... | └── n.m | - ├── cat_ann # Annotations in the `axis_annotation_#` group MAY be categorical. + ├── cat_ann # Annotations in the `axis_annotation_N` group MAY be categorical. | ├── .zattrs. # `.zattrs` MUST contain `"encoding-type"`, which is set to `"categorical"`. | | | ├── categories @@ -239,7 +239,7 @@ The following describes the expected layout for tabular data. | | ... | └── n | - ├── null_ann # Annotations in the `axis_annotation_#` group MAY be nullable integers. + ├── null_ann # Annotations in the `axis_annotation_N` group MAY be nullable integers. | ├── .zattrs. # `.zattrs` MUST contain `"encoding-type"`, which is set to `"nullable-integer"`. | | | ├── values @@ -255,12 +255,14 @@ The following describes the expected layout for tabular data. | | ... | └── n | - └── sparse_ann # Annotations in the `axis_annotation_#` group MAY be sparse arrays. + └── sparse_ann # Annotations in the `axis_annotation_N` group MAY be sparse arrays. 
| ├── .zgroup | ├── .zattrs # `.zattrs` MUST contain `"encoding-type"`, which is set to `"csr_matrix"` or `"csc_matrix"` for compressed sparse row and compressed sparse column, respectively. | # `.zattrs` MUST contain `"shape"` which is an array giving the shape of the densified array. + | # `.zattrs` MAY contain a list of strings `"column_names"` that contains the headers of the columns of the array. + | | ├── data # The sparse group MUST contain a one-dimensional zarr array `data`. | | # `data` MAY be chunked as the user desires. From d61a9e13ca40ce4f5cdb714bb0cd9a1971de5a47 Mon Sep 17 00:00:00 2001 From: Michael Innerberger Date: Thu, 1 Jun 2023 12:10:11 -0400 Subject: [PATCH 03/35] First draft of minimal table spec --- latest/index.bs | 140 +++++++++++++++++------------------------------- 1 file changed, 49 insertions(+), 91 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index 25ece648..28237ae6 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -177,12 +177,42 @@ For this example we assume an image with 5 dimensions and axes called `t,c,z,y,x Tables {#table-layout} ---------------------- -The following describes the expected layout for tabular data. +The following describes the expected layout for tabular data. Tabular data is +usually used to annotate axes of existing datasets (e.g., a list of names of +the corresponding hyperslices, acquisition parameters, results from downstream +analysis). 
In case the table is annotating existing data, this data MUST be +specified by the following metadata: +```json +"annotated-data": [ # Array containing all dataset / axis pairs this table is annotating + { + "path": "/path/to/datasetA", + "axis": "x", + }, + { + "path": "/path/to/datasetA", + "axis": "y", + }, + { + "path": "/path/to/datasetB", + "axis": "t", + } +], +``` + +A dataset MAY backlink to annotations for its axes by having the following metadata in its `.zattrs`: + +```json +"annotations": { # Group containing at most one annotation path for each axis + "t": "/path/to/t-annotation", + "y": "/path/to/y-annotation" +}, +``` + +The on-disk format of a table group looks like this:
-    .                             # Root folder, potentially in S3,
-    |                             # with a flat list of images.
+    .                        # Root folder, potentially in S3.
     |
     └── 123.zarr
         |
@@ -190,103 +220,31 @@ The following describes the expected layout for tabular data.
         |
         ├── .zattrs
         |
-        └── my_table              # A table group is a container which holds all files associated with one table
+        └── table            # A table group is a container which holds datasets of shape [N] or [N,M], where M is arbitrary and N MUST be the same across all datasets.
             |
-            |                     # The table group MAY be in the root of the zarr file.
-            ├── .zgroup           # The table group MAY be in root or in another group.
+            ├── .zgroup      # The table group MAY be in the root of the zarr file or in another group.
             |
-            ├── .zattrs           # `.zattrs` MUST contain "type", which is set to `"ngff:table"`
-            |                     # `.zattrs` MAY contain "annotated_data", which is the path to the data the table is annotating.
-            |                     # If present, "annotated_data" MUST be a single path (single piece of data) or an array of paths (multiple data), which MUST be strings.
+            ├── .zattrs      # `.zattrs` MUST contain "type", which is set to `"ngff:table"`
+            |                # `.zattrs` MUST contain "column-order", which is a list of the order of the annotations.
+            |                # `.zattrs` MUST contain "annotated-data", which is specified above.
             |
-            └── table_data        # The table group MUST contain a zarr array `table_data`.
-            |   |                 # `table_data` MUST not be a complex type (i.e., MUST be a single type)
-            |   |                 # `table_data` MAY be chunked as the user desires.
-            |   ├── .zarray
+            ├── annotation_1 # The group can hold an arbitrary number of datasets subject to the above restrictions.
+            |   |
+            |   ├── .zarray  # If `annotated-data` is not empty, the dimension N MUST correspond to the dimensions of all axes the table is annotating.
+            |   ├── .zattrs  # If `annotated-data` contains exactly one entry, the dimension N of all datasets in the table group
+            |   |            #     SHOULD be chunked in the same way as the corresponding dimension the table is annotating.
             |   ├── 0.0
             |   |   ...
             |   └── n.m
             |
-            └── axis_annotation_0 # The table MAY contain one ore more `axis_annotation_N` group containers.
-                |                 # These groups hold datasets and groups that represent annotations on one dimension of `table_data`.
-                |                 # The columns of all annotations in `axis_annotation_N` MUST be index-matched to the corresponding dimension of `table_data`.
-                |                 # The columns of all annotations in `axis_annotation_N` SHOULD be chunked in the same way as the corresponding dimension of `table_data`.
+            └── _index       # The talbe group SHOULD contain a 1D dataset of strings called `_index` (the names of annotated hyperslices).
                 |
-                ├── .zattrs       # `.zattrs` MUST contain `"axis"`, which is the name of the axis in `table_data` the group is annotating.
-                |                 # `.zattrs` MUST contain `"column-order"`, which is a list of the order of the annotations.
+                ├── .zarray
+                ├── .zattrs
                 |
-                ├── dense_ann     # Annotations in the `axis_annotation_N` group MAY be a 1D or 2D zarr array. The rows can be chunked as the user desires.
-                |   ├── .zarray
-                |   ├── .zattrs   # If the array is 2D, `.zattrs` MAY contain a list of strings `"column_names"` that contains the headers of the columns of the array.
-                |   |
-                |   ├── 0.0
-                |   |   ...
-                |   └── n.m
-                |
-                ├── cat_ann       # Annotations in the `axis_annotation_N` group MAY be categorical.
-                |   ├── .zattrs.  # `.zattrs` MUST contain `"encoding-type"`, which is set to `"categorical"`.
-                |   |
-                |   ├── categories
-                |   |   ├── .zarray   # `categories` MUST be a 1D zarr array of strings.
-                |   |   |
-                |   |   ├── 0
-                |   |   |   ...
-                |   |   └── n
-                |   └── codes
-                |       ├── .zarray   # `codes` MUST be a 1D zarr array of 0-based indices into the `categories` array.
-                |       |
-                |       ├── 0
-                |       |   ...
-                |       └── n
-                |
-                ├── null_ann      # Annotations in the `axis_annotation_N` group MAY be nullable integers.
-                |   ├── .zattrs.  # `.zattrs` MUST contain `"encoding-type"`, which is set to `"nullable-integer"`.
-                |   |
-                |   ├── values
-                |   |   ├── .zarray   # `values` MUST be a 1D zarr array of integers.
-                |   |   |
-                |   |   ├── 0
-                |   |   |   ...
-                |   |   └── n
-                |   └── mask
-                |       ├── .zarray   # `mask` MUST be a 1D boolean zarr array of the same size as `values`.
-                |       |
-                |       ├── 0
-                |       |   ...
-                |       └── n
-                |
-                └── sparse_ann    # Annotations in the `axis_annotation_N` group MAY be sparse arrays.
-                    |
-                    ├── .zgroup
-                    |
-                    ├── .zattrs   # `.zattrs` MUST contain `"encoding-type"`, which is set to `"csr_matrix"` or `"csc_matrix"` for compressed sparse row and compressed sparse column, respectively.
-                    |             # `.zattrs` MUST contain `"shape"` which is an array giving the shape of the densified array.
-                    |             # `.zattrs` MAY contain a list of strings `"column_names"` that contains the headers of the columns of the array.
-                    |
-                    |
-                    ├── data      # The sparse group MUST contain a one-dimensional zarr array `data`.
-                    |   |         # `data` MAY be chunked as the user desires.
-                    |   ├── .zarray
-                    |   |
-                    |   ├── 0
-                    |   |   ...
-                    |   └── n
-                    |
-                    ├── indices   # The sparse group MUST contain a one-dimensional zarr array `indices`.
-                    |   |         # `indices` MAY be chunked as the user desires but SHOULD be chunked in the same way as `data`.
-                    |   ├── .zarray   # `indices` MUST be an `int` dtype.
-                    |   |
-                    |   ├── 0
-                    |   |   ...
-                    |   └── n
-                    |
-                    └── indptr    # The sparse group MUST contain a one-dimensional zarr array `indptr`.
-                        |         # `indptr` MAY be chunked as the user desires but SHOULD be chunked in the same way as `data`.
-                        ├── .zarray   # `indptr` MUST be an `int` dtype.
-                        |
-                        ├── 0
-                        |   ...
-                        └── n
+                ├── 0
+                |   .
+                └── n
 
High-content screening {#hcs-layout} From 3d38a15a8ae2c500752a48ff3a3d2df1de514b6a Mon Sep 17 00:00:00 2001 From: Michael Innerberger Date: Thu, 1 Jun 2023 17:20:10 -0400 Subject: [PATCH 04/35] Include suggestions by @d-v-b --- latest/index.bs | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index 28237ae6..8ccf31d2 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -178,37 +178,28 @@ For this example we assume an image with 5 dimensions and axes called `t,c,z,y,x Tables {#table-layout} ---------------------- The following describes the expected layout for tabular data. Tabular data is -usually used to annotate axes of existing datasets (e.g., a list of names of +usually used to annotate axes of existing arrays (e.g., a list of names of the corresponding hyperslices, acquisition parameters, results from downstream analysis). In case the table is annotating existing data, this data MUST be specified by the following metadata: ```json -"annotated-data": [ # Array containing all dataset / axis pairs this table is annotating +"annotated-data": [ # Array containing all array / axis pairs this table is annotating { - "path": "/path/to/datasetA", - "axis": "x", + "array": "/path/to/arrayA", + "dimension": 1, }, { - "path": "/path/to/datasetA", - "axis": "y", + "array": "/path/to/arrayA", + "dimension": 2, }, { - "path": "/path/to/datasetB", - "axis": "t", + "array": "/path/to/arrayB", + "dimension": 2, } ], ``` -A dataset MAY backlink to annotations for its axes by having the following metadata in its `.zattrs`: - -```json -"annotations": { # Group containing at most one annotation path for each axis - "t": "/path/to/t-annotation", - "y": "/path/to/y-annotation" -}, -``` - The on-disk format of a table group looks like this:
@@ -220,7 +211,7 @@ The on-disk format of a table group looks like this:
         |
         ├── .zattrs
         |
-        └── table            # A table group is a container which holds datasets of shape [N] or [N,M], where M is arbitrary and N MUST be the same across all datasets.
+        └── table            # A table group is a container which holds arrays of shape [N] or [N,M], where M is arbitrary and N MUST be the same across all arrays.
             |
             ├── .zgroup      # The table group MAY be in the root of the zarr file or in another group.
             |
@@ -228,16 +219,16 @@ The on-disk format of a table group looks like this:
             |                # `.zattrs` MUST contain "column-order", which is a list of the order of the annotations.
             |                # `.zattrs` MUST contain "annotated-data", which is specified above.
             |
-            ├── annotation_1 # The group can hold an arbitrary number of datasets subject to the above restrictions.
+            ├── annotation_1 # The group can hold an arbitrary number of arrays subject to the above restrictions.
             |   |
             |   ├── .zarray  # If `annotated-data` is not empty, the dimension N MUST correspond to the dimensions of all axes the table is annotating.
-            |   ├── .zattrs  # If `annotated-data` contains exactly one entry, the dimension N of all datasets in the table group
+            |   ├── .zattrs  # If `annotated-data` contains exactly one entry, the dimension N of all arrays in the table group
             |   |            #     SHOULD be chunked in the same way as the corresponding dimension the table is annotating.
             |   ├── 0.0
             |   |   ...
             |   └── n.m
             |
-            └── _index       # The talbe group SHOULD contain a 1D dataset of strings called `_index` (the names of annotated hyperslices).
+            └── _index       # The table group SHOULD contain a 1D array of strings called `_index` (the names of annotated hyperslices).
                 |
                 ├── .zarray
                 ├── .zattrs

From ea990e83bd6adc4e79e57fe1f7218810066e063b Mon Sep 17 00:00:00 2001
From: Virginia Scarlett 
Date: Fri, 2 Jun 2023 11:43:53 -0400
Subject: [PATCH 05/35] overhaul tables section introduction text

---
 latest/index.bs | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/latest/index.bs b/latest/index.bs
index 8ccf31d2..6a388d94 100644
--- a/latest/index.bs
+++ b/latest/index.bs
@@ -175,13 +175,22 @@ For this example we assume an image with 5 dimensions and axes called `t,c,z,y,x
                 └── n
 
-Tables {#table-layout} +Tables and table annotations {#table-layout} ---------------------- -The following describes the expected layout for tabular data. Tabular data is -usually used to annotate axes of existing arrays (e.g., a list of names of -the corresponding hyperslices, acquisition parameters, results from downstream -analysis). In case the table is annotating existing data, this data MUST be -specified by the following metadata: +The following section describes the expected layout for tables within an OME-NGFF file. +Typically, a table is a 2-dimensional data structure consisting of rows and columns. +In this schema, a table is a zarr group containing one or more n-dimensional arrays. +We expect that 2-dimensional arrays will be typical, but there may be use cases in which +it is sensible to conceptualize a 1D, 3D, or other-dimensional array as a table. + +It is often useful to organize related tables together. In particular, the AnnData +data model is based on a main table with additional 'annotation' tables that have the same +number of rows, columns, or both, as the main table. AnnData tables can be stored in this +section of an OME-NGFF file. See the AnnData documentation for detailed recommendations on +storing AnnData data structures within Zarr. 
Here, an AnnData dataset corresponds to a table +group, and the .zattrs file for the table group MUST contain the "annotated-data" property, +as shown below: + ```json "annotated-data": [ # Array containing all array / axis pairs this table is annotating From 0b3b652b9a36b7e7de61fe7a2e129052dc5d215d Mon Sep 17 00:00:00 2001 From: Virginia Scarlett Date: Fri, 2 Jun 2023 11:54:28 -0400 Subject: [PATCH 06/35] start messing around with tables tree diagram --- latest/index.bs | 58 +++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index 6a388d94..f6c25ecb 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -179,21 +179,21 @@ Tables and table annotations {#table-layout} ---------------------- The following section describes the expected layout for tables within an OME-NGFF file. Typically, a table is a 2-dimensional data structure consisting of rows and columns. -In this schema, a table is a zarr group containing one or more n-dimensional arrays. +In this schema, a table is a Zarr group containing one or more n-dimensional arrays. We expect that 2-dimensional arrays will be typical, but there may be use cases in which it is sensible to conceptualize a 1D, 3D, or other-dimensional array as a table. It is often useful to organize related tables together. In particular, the AnnData data model is based on a main table with additional 'annotation' tables that have the same -number of rows, columns, or both, as the main table. AnnData tables can be stored in this -section of an OME-NGFF file. See the AnnData documentation for detailed recommendations on -storing AnnData data structures within Zarr. Here, an AnnData dataset corresponds to a table -group, and the .zattrs file for the table group MUST contain the "annotated-data" property, +number of rows, columns, or both, as the main table. AnnData tables can be stored in the tables/ +directory of an OME-NGFF file. 
See the AnnData documentation for detailed recommendations on +formatting AnnData data structures within Zarr. An OME-NGFF table group can be an AnnData dataset, +in which case the .zattrs file for the table group MUST contain the "annotated-data" property, as shown below: ```json -"annotated-data": [ # Array containing all array / axis pairs this table is annotating +"annotated-data": [ # A JSON array containing all array / axis pairs this table is annotating { "array": "/path/to/arrayA", "dimension": 1, @@ -220,31 +220,33 @@ The on-disk format of a table group looks like this: | ├── .zattrs | - └── table # A table group is a container which holds arrays of shape [N] or [N,M], where M is arbitrary and N MUST be the same across all arrays. + └──tables | - ├── .zgroup # The table group MAY be in the root of the zarr file or in another group. - | - ├── .zattrs # `.zattrs` MUST contain "type", which is set to `"ngff:table"` - | # `.zattrs` MUST contain "column-order", which is a list of the order of the annotations. - | # `.zattrs` MUST contain "annotated-data", which is specified above. - | - ├── annotation_1 # The group can hold an arbitrary number of arrays subject to the above restrictions. - | | - | ├── .zarray # If `annotated-data` is not empty, the dimension N MUST correspond to the dimensions of all axes the table is annotating. - | ├── .zattrs # If `annotated-data` contains exactly one entry, the dimension N of all arrays in the table group - | | # SHOULD be chunked in the same way as the corresponding dimension the table is annotating. - | ├── 0.0 - | | ... - | └── n.m - | - └── _index # The table group SHOULD contain a 1D array of strings called `_index` (the names of annotated hyperslices). + └── table # A table group is a container which holds arrays of shape [N] or [N,M], where M is arbitrary and N MUST be the same across all arrays. | - ├── .zarray - ├── .zattrs + ├── .zgroup # The table group MAY be in the root of the zarr file or in another group. 
| - ├── 0 - | . - └── n + ├── .zattrs # `.zattrs` MUST contain "type", which is set to `"ngff:table"` + | # `.zattrs` MUST contain "column-order", which is a list of the order of the annotations. + | # `.zattrs` MUST contain "annotated-data", which is specified above. + | + ├── annotation_1 # The group can hold an arbitrary number of arrays subject to the above restrictions. + | | + | ├── .zarray # If `annotated-data` is not empty, the dimension N MUST correspond to the dimensions of all axes the table is annotating. + | ├── .zattrs # If `annotated-data` contains exactly one entry, the dimension N of all arrays in the table group + | | # SHOULD be chunked in the same way as the corresponding dimension the table is annotating. + | ├── 0.0 + | | ... + | └── n.m + | + └── _index # The table group SHOULD contain a 1D array of strings called `_index` (the names of annotated hyperslices). + | + ├── .zarray + ├── .zattrs + | + ├── 0 + | . + └── n High-content screening {#hcs-layout} From 0f58543e66396508120e9a57a0c9713602e5a622 Mon Sep 17 00:00:00 2001 From: Virginia Scarlett Date: Fri, 2 Jun 2023 16:42:21 -0400 Subject: [PATCH 07/35] augment and clarify tables introduction --- latest/index.bs | 50 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index f6c25ecb..c5b66416 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -177,19 +177,21 @@ For this example we assume an image with 5 dimensions and axes called `t,c,z,y,x Tables and table annotations {#table-layout} ---------------------- -The following section describes the expected layout for tables within an OME-NGFF file. -Typically, a table is a 2-dimensional data structure consisting of rows and columns. -In this schema, a table is a Zarr group containing one or more n-dimensional arrays. 
-We expect that 2-dimensional arrays will be typical, but there may be use cases in which -it is sensible to conceptualize a 1D, 3D, or other-dimensional array as a table. +A table is a 2-dimensional data structure consisting of rows and columns. +Many domain-specific workflows involve tables with certain standard properties. +Tables can also simply be an intuitive format for storing arbitrary data +associated with an experiment. -It is often useful to organize related tables together. In particular, the AnnData -data model is based on a main table with additional 'annotation' tables that have the same -number of rows, columns, or both, as the main table. AnnData tables can be stored in the tables/ -directory of an OME-NGFF file. See the AnnData documentation for detailed recommendations on -formatting AnnData data structures within Zarr. An OME-NGFF table group can be an AnnData dataset, -in which case the .zattrs file for the table group MUST contain the "annotated-data" property, -as shown below: +In OME-NGFF, a table is a Zarr group containing one or more Zarr arrays, where each +array represents one column of the table. Columns are ordered, and each column in a +table MUST have the same number of rows. While the table itself MUST be 2-dimensional, +the columns need not be. 1-dimensional columns will be typical. Nevertheless, there +may be use cases in which it is sensible to conceptualize a 2D, 3D, or higher-dimensional +array as a single column. + +Tables in an OME-NGFF file are located in the tables/ directory, in the root of an image's +Zarr group, alongside the labels/ directory if one is present. The .zattrs file immediately +within tables/ MUST contain the "annotated-data" property, as shown below: ```json @@ -209,6 +211,12 @@ as shown below: ], ``` +There MAY be intermediate directories between tables/ and a particular table. +These SHOULD NOT contain metadata, unless the intermediate directory represents the parent +directory for an AnnData object. 
(See note below.) The names of directories beneath the tables/ +directory are arbitrary, except in the AnnData case. + + The on-disk format of a table group looks like this:
@@ -221,16 +229,19 @@ The on-disk format of a table group looks like this:
         ├── .zattrs
         |
         └──tables
+            ├── .zgroup
+            |
+            ├── .zattrs
             |
-            └── table            # A table group is a container which holds arrays of shape [N] or [N,M], where M is arbitrary and N MUST be the same across all arrays.
+            └── table_1            # A table group is a container which holds arrays of shape [N] or [N,M], where M is arbitrary and N MUST be the same across all arrays.
                 |
-                ├── .zgroup      # The table group MAY be in the root of the zarr file or in another group.
+                ├── .zgroup      
                 |
                 ├── .zattrs      # `.zattrs` MUST contain "type", which is set to `"ngff:table"`
                 |                # `.zattrs` MUST contain "column-order", which is a list of the order of the annotations.
                 |                # `.zattrs` MUST contain "annotated-data", which is specified above.
                 |
-                ├── annotation_1 # The group can hold an arbitrary number of arrays subject to the above restrictions.
+                ├── column_1 # The group can hold an arbitrary number of arrays subject to the above restrictions.
                 |   |
                 |   ├── .zarray  # If `annotated-data` is not empty, the dimension N MUST correspond to the dimensions of all axes the table is annotating.
                 |   ├── .zattrs  # If `annotated-data` contains exactly one entry, the dimension N of all arrays in the table group
@@ -239,7 +250,7 @@ The on-disk format of a table group looks like this:
                 |   |   ...
                 |   └── n.m
                 |
-                └── _index       # The table group SHOULD contain a 1D array of strings called `_index` (the names of annotated hyperslices).
+                └── _index       # The table group SHOULD contain a 1D array of strings called `_index` (the names of annotated columns).
                     |
                     ├── .zarray
                     ├── .zattrs
@@ -249,6 +260,13 @@ The on-disk format of a table group looks like this:
                     └── n
 
+ +Note: The AnnData data model is based on a main table with additional 'annotation' +tables that have the same number of rows, columns, or both, as the main table. AnnData +objects can be stored in the tables/ directory of an OME-NGFF file. See the AnnData +documentation for detailed recommendations on formatting AnnData data structures within Zarr. + + High-content screening {#hcs-layout} ------------------------------------ From eef3b3ee05a7c7f18543d1ae2f010481901a28b5 Mon Sep 17 00:00:00 2001 From: Michael Innerberger Date: Mon, 5 Jun 2023 12:00:35 -0400 Subject: [PATCH 08/35] Homogenize dimension requirement for columns --- latest/index.bs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index c5b66416..d13d379f 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -220,7 +220,7 @@ directory are arbitrary, except in the AnnData case. The on-disk format of a table group looks like this:
-    .                        # Root folder, potentially in S3.
+    .                            # Root folder, potentially in S3.
     |
     └── 123.zarr
         |
@@ -233,7 +233,7 @@ The on-disk format of a table group looks like this:
             |
             ├── .zattrs
             |
-            └── table_1            # A table group is a container which holds arrays of shape [N] or [N,M], where M is arbitrary and N MUST be the same across all arrays.
+            └── table_1          # A table group is a container which holds arrays of arbitrary dimension, where the number of rows N MUST be the same across all arrays.
                 |
                 ├── .zgroup      
                 |
@@ -241,16 +241,16 @@ The on-disk format of a table group looks like this:
                 |                # `.zattrs` MUST contain "column-order", which is a list of the order of the annotations.
                 |                # `.zattrs` MUST contain "annotated-data", which is specified above.
                 |
-                ├── column_1 # The group can hold an arbitrary number of arrays subject to the above restrictions.
+                ├── column_1     # The group can hold an arbitrary number of arrays subject to the above restrictions.
                 |   |
-                |   ├── .zarray  # If `annotated-data` is not empty, the dimension N MUST correspond to the dimensions of all axes the table is annotating.
+                |   ├── .zarray  # If `annotated-data` is not empty, the number of rows N MUST correspond to the dimensions of all axes the table is annotating.
                 |   ├── .zattrs  # If `annotated-data` contains exactly one entry, the dimension N of all arrays in the table group
                 |   |            #     SHOULD be chunked in the same way as the corresponding dimension the table is annotating.
                 |   ├── 0.0
                 |   |   ...
                 |   └── n.m
                 |
-                └── _index       # The table group SHOULD contain a 1D array of strings called `_index` (the names of annotated columns).
+                └── _index       # The table group SHOULD contain a 1D array of strings called `_index` (the names of rows in the table).
                     |
                     ├── .zarray
                     ├── .zattrs

From e86478241177e3ab2f77d54aad526c4a815b717e Mon Sep 17 00:00:00 2001
From: Michael Innerberger 
Date: Mon, 5 Jun 2023 12:01:38 -0400
Subject: [PATCH 09/35] Homogenize appearance of zarr structure

---
 latest/index.bs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/latest/index.bs b/latest/index.bs
index d13d379f..0539fd72 100644
--- a/latest/index.bs
+++ b/latest/index.bs
@@ -225,18 +225,16 @@ The on-disk format of a table group looks like this:
     └── 123.zarr
         |
         ├── .zgroup
-        |
         ├── .zattrs
         |
         └──tables
-            ├── .zgroup
             |
+            ├── .zgroup
             ├── .zattrs
             |
             └── table_1          # A table group is a container which holds arrays of arbitrary dimension, where the number of rows N MUST be the same across all arrays.
                 |
                 ├── .zgroup      
-                |
                 ├── .zattrs      # `.zattrs` MUST contain "type", which is set to `"ngff:table"`
                 |                # `.zattrs` MUST contain "column-order", which is a list of the order of the annotations.
                 |                # `.zattrs` MUST contain "annotated-data", which is specified above.

From 2d6f1532aefdcd6974e55f70fd8088b724d45a63 Mon Sep 17 00:00:00 2001
From: Michael Innerberger 
Date: Mon, 5 Jun 2023 15:16:18 -0400
Subject: [PATCH 10/35] Add improvements based on discussion with
 @virginiascarlett

---
 latest/index.bs | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/latest/index.bs b/latest/index.bs
index 0539fd72..42f75d69 100644
--- a/latest/index.bs
+++ b/latest/index.bs
@@ -195,7 +195,7 @@ within tables/ MUST contain the "annotated-data" property, as shown below:
 
 
 ```json
-"annotated-data": [               # A JSON array containing all array / axis pairs this table is annotating
+"annotated-data": [               # A JSON array containing all array / dimension pairs this table is annotating
     {
         "array": "/path/to/arrayA",
         "dimension": 1,
@@ -232,29 +232,31 @@ The on-disk format of a table group looks like this:
             ├── .zgroup
             ├── .zattrs
             |
-            └── table_1          # A table group is a container which holds arrays of arbitrary dimension, where the number of rows N MUST be the same across all arrays.
+            └── table_1          # A table group holds arrays of arbitrary dimension, where the number of rows n MUST be the same across all arrays.
                 |
-                ├── .zgroup      
-                ├── .zattrs      # `.zattrs` MUST contain "type", which is set to `"ngff:table"`
-                |                # `.zattrs` MUST contain "column-order", which is a list of the order of the annotations.
+                ├── .zgroup
+                ├── .zattrs      # `.zattrs` MUST contain "column-order", which is a list of the order of the columns.
                 |                # `.zattrs` MUST contain "annotated-data", which is specified above.
+                |                # If `annotated-data` is not empty, the number of rows n MUST correspond to the dimensions of all axes the table is annotating.
+                |                # If `annotated-data` contains exactly one entry, the dimension n of all arrays in the table group
+                |                #     SHOULD be chunked in the same way as the corresponding dimension the table is annotating.
                 |
-                ├── column_1     # The group can hold an arbitrary number of arrays subject to the above restrictions.
+                ├── column_1     # The group can hold zero, one, or an arbitrary number of arrays.
+                |   |
+                |   ├── .zarray
+                |   ├── .zattrs
                 |   |
-                |   ├── .zarray  # If `annotated-data` is not empty, the number of rows N MUST correspond to the dimensions of all axes the table is annotating.
-                |   ├── .zattrs  # If `annotated-data` contains exactly one entry, the dimension N of all arrays in the table group
-                |   |            #     SHOULD be chunked in the same way as the corresponding dimension the table is annotating.
                 |   ├── 0.0
                 |   |   ...
                 |   └── n.m
                 |
-                └── _index       # The table group SHOULD contain a 1D array of strings called `_index` (the names of rows in the table).
+                └── row_names    # The table group SHOULD contain a 1D array of strings called `row_names`.
                     |
                     ├── .zarray
                     ├── .zattrs
                     |
                     ├── 0
-                    |   .
+                    |   :
                     └── n
 
From 44625479ffe30197063108d8755621f79daa9d90 Mon Sep 17 00:00:00 2001 From: Virginia Scarlett Date: Mon, 5 Jun 2023 16:14:31 -0400 Subject: [PATCH 11/35] clarify purpose of annotations --- latest/index.bs | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index 42f75d69..42d8fe8b 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -193,7 +193,6 @@ Tables in an OME-NGFF file are located in the tables/ directory, in the root of Zarr group, alongside the labels/ directory if one is present. The .zattrs file immediately within tables/ MUST contain the "annotated-data" property, as shown below: - ```json "annotated-data": [ # A JSON array containing all array / dimension pairs this table is annotating { @@ -211,9 +210,21 @@ within tables/ MUST contain the "annotated-data" property, as shown below: ], ``` -There MAY be intermediate directories between tables/ and a particular table. +The annotated-data metadata specifies that a table is annotating a particular dimension of an array. +Tables MAY annotate arrays, and can only annotate arrays. If a table is annotating an array, +then the first dimension of the column(s) (i.e. the number of rows in the table) MUST correspond +to the length of the dimension that is being annotated in the source array. If `annotated-data` contains +exactly one entry, the first dimension of all arrays in the table group SHOULD be chunked in the same way +as the corresponding dimension of the source array. + +Note: The AnnData data model is based on a main array with additional 'annotation' +tables that have the same number of rows, columns, or both, as the main table. AnnData +objects can be stored in the tables/ directory of an OME-NGFF file. See the AnnData +documentation for detailed recommendations on formatting AnnData data structures within Zarr. + +There MAY be one or more intermediate directories between tables/ and a particular table. 
These SHOULD NOT contain metadata, unless the intermediate directory represents the parent -directory for an AnnData object. (See note below.) The names of directories beneath the tables/ +directory for an AnnData object. The names of directories beneath the tables/ directory are arbitrary, except in the AnnData case. @@ -237,11 +248,8 @@ The on-disk format of a table group looks like this: ├── .zgroup ├── .zattrs # `.zattrs` MUST contain "column-order", which is a list of the order of the columns. | # `.zattrs` MUST contain "annotated-data", which is specified above. - | # If `annotated-data` is not empty, the number of rows n MUST correspond to the dimensions of all axes the table is annotating. - | # If `annotated-data` contains exactly one entry, the dimension n of all arrays in the table group - | # SHOULD be chunked in the same way as the corresponding dimension the table is annotating. - | - ├── column_1 # The group can hold zero, one, or an arbitrary number of arrays. + | + ├── column_1 # The table group MAY hold zero, one, or an arbitrary number of column arrays. | | | ├── .zarray | ├── .zattrs @@ -261,12 +269,6 @@ The on-disk format of a table group looks like this: -Note: The AnnData data model is based on a main table with additional 'annotation' -tables that have the same number of rows, columns, or both, as the main table. AnnData -objects can be stored in the tables/ directory of an OME-NGFF file. See the AnnData -documentation for detailed recommendations on formatting AnnData data structures within Zarr. 
- - High-content screening {#hcs-layout} ------------------------------------ From d794f22a1ce9dfba0dc02a783e2d254087651f31 Mon Sep 17 00:00:00 2001 From: Virginia Scarlett Date: Mon, 5 Jun 2023 16:20:30 -0400 Subject: [PATCH 12/35] update first tables paragraph --- latest/index.bs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index 42d8fe8b..1d935eca 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -178,9 +178,8 @@ For this example we assume an image with 5 dimensions and axes called `t,c,z,y,x Tables and table annotations {#table-layout} ---------------------- A table is a 2-dimensional data structure consisting of rows and columns. -Many domain-specific workflows involve tables with certain standard properties. -Tables can also simply be an intuitive format for storing arbitrary data -associated with an experiment. +Tables are an intuitive way of storing arbitrary data or metadata that may be large, +highly structured, and may or may not be associated with an image. In OME-NGFF, a table is a Zarr group containing one or more Zarr arrays, where each array represents one column of the table. Columns are ordered, and each column in a From dd9ece89b1240a2f2e03403d4fd73a21fa5febc4 Mon Sep 17 00:00:00 2001 From: Virginia Scarlett Date: Mon, 5 Jun 2023 16:31:52 -0400 Subject: [PATCH 13/35] change tables section title --- latest/index.bs | 2 +- latest/index.html | 1243 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1244 insertions(+), 1 deletion(-) create mode 100644 latest/index.html diff --git a/latest/index.bs b/latest/index.bs index 1d935eca..0beffc31 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -175,7 +175,7 @@ For this example we assume an image with 5 dimensions and axes called `t,c,z,y,x └── n -Tables and table annotations {#table-layout} +Tables {#table-layout} ---------------------- A table is a 2-dimensional data structure consisting of rows and columns. 
Tables are an intuitive way of storing arbitrary data or metadata that may be large, diff --git a/latest/index.html b/latest/index.html new file mode 100644 index 00000000..67cc9513 --- /dev/null +++ b/latest/index.html @@ -0,0 +1,1243 @@ + + + + + Next-generation file formats (NGFF) + + + + + + + + + + + + + +
+

+

Next-generation file formats (NGFF)

+

Editor’s Draft,

+
+ More details about this document + +
+
+ +
+
+
+

Abstract

+

This document contains next-generation file format (NGFF) + +specifications for storing bioimaging data in the cloud. +All specifications are submitted to the https://image.sc community for review.

+
+

Status of this document

+
+

+

The current released version of this specification is 0.4. Migration scripts +will be provided between numbered versions. Data written with these latest changes +(an "editor’s draft") will not necessarily be supported.

+
+
+ +
+

1. Introduction

+

Bioimaging science is at a crossroads. Currently, the drive to acquire more, +larger, more precise spatial measurements is unfortunately at odds with our ability +to structure and share those measurements with others. During a global pandemic +more than ever, we believe fervently that global, collaborative discovery as +opposed to the post-publication, "data-on-request" mode of operation is the +path forward. Bioimaging data should be shareable via open and commercial cloud +resources without the need to download entire datasets.

+

At the moment, that is not the norm. The plethora of data formats produced by +imaging systems are ill-suited to remote sharing. Individual scientists +typically lack the infrastructure they need to host these data themselves. When +they acquire images from elsewhere, time-consuming translations and data +cleaning are needed to interpret findings. Those same costs are multiplied when +gathering data into online repositories where curator time can be the limiting +factor before publication is possible. Without a common effort, each lab or +resource is left building the tools they need and maintaining that +infrastructure often without dedicated funding.

+

This document defines a specification for bioimaging data to make it possible +to convert proprietary formats into a common, cloud-ready one. +Such next-generation file formats lay out data so that individual portions, or +"chunks", of large data are referenceable, eliminating the need to download +entire datasets.

+

1.1. Why "NGFF"?

+

A short description of what is needed for an imaging format is "a hierarchy +of n-dimensional (dense) arrays with metadata". This combination of features +is certainly provided by HDF5 from the HDF Group, which a number of +bioimaging formats do use. HDF5 and other larger binary structures, however, +are ill-suited for storage in the cloud where accessing individual chunks +of data by name rather than seeking through a large file is at the heart of +parallelization.

+

As a result, a number of formats have been developed more recently which provide +the basic data structure of an HDF5 file, but do so in a more cloud-friendly way. +In the PyData community, the Zarr [zarr] format was developed +for easily storing collections of NumPy arrays. In the ImageJ community, N5 [n5] was developed to work around +the limitations of HDF5 ("N5" was originally short for "Not-HDF5"). +Both of these formats permit storing individual chunks of data either locally in +separate files or in cloud-based object stores as separate keys.

+

A current effort is underway to unify the two similar specifications to provide a single binary +specification. The editor’s draft will soon be entering a request for comments (RFC) phase with the goal of having a first version early in 2021. As that +process comes to an end, this document will be updated.

+

1.2. OME-NGFF

+

The conventions and specifications defined in this document are designed to +enable next-generation file formats to represent the same bioimaging data +that can be represented in OME-TIFF and beyond. However, the conventions will also be usable by HDF5 and other sufficiently advanced +binary containers. Eventually, we hope, the moniker "next-generation" will no longer be +applicable, and this will simply be the most efficient, common, and useful representation +of bioimaging data, whether during acquisition or sharing in the cloud.

+

Note: The following text makes use of OME-Zarr [ome-zarr-py], the current prototype implementation, +for all examples.

+

1.3. Document conventions

+

The key words “MUST”, “MUST NOT”, “REQUIRED”, “SHALL”, “SHALL NOT”, “SHOULD”, “SHOULD NOT”, +“RECOMMENDED”, “MAY”, and “OPTIONAL” are to be interpreted as described in RFC 2119.

+

Some of the JSON examples in this document include comments. However, these are only for +clarity purposes and comments MUST NOT be included in JSON objects.

+

2. On-disk (or in-cloud) layout

+

An overview of the layout of an OME-Zarr fileset should make +understanding the following metadata sections easier. The hierarchy +is represented here as it would appear locally but could equally +be stored on a web server to be accessed via HTTP or in object storage +like S3 or GCS.

+

OME-Zarr is an implementation of the OME-NGFF specification using the Zarr +format. Arrays MUST be defined and stored in a hierarchical organization as +defined by version 2 of the Zarr specification. +OME-NGFF metadata MUST be stored as attributes in the corresponding Zarr +groups.

+

2.1. Images

+

The following layout describes the expected Zarr hierarchy for images with +multiple levels of resolutions and optionally associated labels. +Note that the number of dimensions is variable between 2 and 5 and that axis names are arbitrary, see § 3.3 "multiscales" metadata for details. +For this example we assume an image with 5 dimensions and axes called t,c,z,y,x.

+
.                             # Root folder, potentially in S3,
+│                             # with a flat list of images by image ID.
+│
+├── 123.zarr                  # One image (id=123) converted to Zarr.
+│
+└── 456.zarr                  # Another image (id=456) converted to Zarr.
+    │
+    ├── .zgroup               # Each image is a Zarr group, or a folder, of other groups and arrays.
+    ├── .zattrs               # Group level attributes are stored in the .zattrs file and include
+    │                         # "multiscales" and "omero" (see below). In addition, the group level attributes
+    │                         # must also contain "_ARRAY_DIMENSIONS" if this group directly contains multi-scale arrays.
+    │
+    ├── 0                     # Each multiscale level is stored as a separate Zarr array,
+    │   ...                   # which is a folder containing chunk files which compose the array.
+    ├── n                     # The name of the array is arbitrary with the ordering defined by
+    │   │                     # the "multiscales" metadata, but is often a sequence starting at 0.
+    │   │
+    │   ├── .zarray           # All image arrays must be up to 5-dimensional
+    │   │                     # with the axis of type time before type channel, before spatial axes.
+    │   │
+    │   └─ t                  # Chunks are stored with the nested directory layout.
+    │      └─ c               # All but the last chunk element are stored as directories.
+    │         └─ z            # The terminal chunk is a file. Together the directory and file names
+    │            └─ y         # provide the "chunk coordinate" (t, c, z, y, x), where the maximum coordinate
+    │               └─ x      # will be dimension_size / chunk_size.
+    │
+    └── labels
+        │
+        ├── .zgroup           # The labels group is a container which holds a list of labels to make the objects easily discoverable
+        │
+        ├── .zattrs           # All labels will be listed in .zattrs e.g. { "labels": [ "original/0" ] }
+        │                     # Each dimension of the label (t, c, z, y, x) should be either the same as the
+        │                     # corresponding dimension of the image, or 1 if that dimension of the label
+        │                     # is irrelevant.
+        │
+        └── original          # Intermediate folders are permitted but not necessary and currently contain no extra metadata.
+            │
+            └── 0             # Multiscale, labeled image. The name is unimportant but is registered in the "labels" group above.
+                ├── .zgroup   # Zarr Group which is both a multiscaled image as well as a labeled image.
+                ├── .zattrs   # Metadata of the related image and as well as display information under the "image-label" key.
+                │
+                ├── 0         # Each multiscale level is stored as a separate Zarr array, as above, but only integer values
+                │   ...       # are supported.
+                └── n
+
+

2.2. Tables

+ A table is a 2-dimensional data structure consisting of rows and columns. +Tables are an intuitive way of storing arbitrary data or metadata that may be large, +highly structured, and may or may not be associated with an image. +

In OME-NGFF, a table is a Zarr group containing one or more Zarr arrays, where each +array represents one column of the table. Columns are ordered, and each column in a +table MUST have the same number of rows. While the table itself MUST be 2-dimensional, +the columns need not be. 1-dimensional columns will be typical. Nevertheless, there +may be use cases in which it is sensible to conceptualize a 2D, 3D, or higher-dimensional +array as a single column.

+

Tables in an OME-NGFF file are located in the tables/ directory, in the root of an image’s +Zarr group, alongside the labels/ directory if one is present. The .zattrs file immediately +within tables/ MUST contain the "annotated-data" property, as shown below:

+
"annotated-data": [               # A JSON array containing all array / dimension pairs this table is annotating
+    {
+        "array": "/path/to/arrayA",
+        "dimension": 1
+    },
+    {
+        "array": "/path/to/arrayA",
+        "dimension": 2
+    },
+    {
+        "array": "/path/to/arrayB",
+        "dimension": 2
+    }
+],
+
+

The annotated-data metadata specifies that a table is annotating a particular dimension of an array. +Tables MAY annotate arrays, and can only annotate arrays. If a table is annotating an array, +then the first dimension of the column(s) (i.e. the number of rows in the table) MUST correspond +to the length of the dimension that is being annotated in the source array. If annotated-data contains +exactly one entry, the first dimension of all arrays in the table group SHOULD be chunked in the same way +as the corresponding dimension of the source array.

+

Note: The AnnData data model is based on a main array with additional annotation tables that have the same number of rows, columns, or both, as the main array. AnnData +objects can be stored in the tables/ directory of an OME-NGFF file. See the AnnData +documentation for detailed recommendations on formatting AnnData data structures within Zarr.

+

There MAY be one or more intermediate directories between tables/ and a particular table. +These SHOULD NOT contain metadata, unless the intermediate directory represents the parent +directory for an AnnData object. The names of directories beneath the tables/ +directory are arbitrary, except in the AnnData case.

+

The on-disk format of a table group looks like this:

+
.                            # Root folder, potentially in S3.
+|
+└── 123.zarr
+    |
+    ├── .zgroup
+    ├── .zattrs
+    |
+    └── tables
+        |
+        ├── .zgroup
+        ├── .zattrs
+        |
+        └── table_1          # A table group holds arrays of arbitrary dimension, where the number of rows n MUST be the same across all arrays.
+            |
+            ├── .zgroup
+            ├── .zattrs      # .zattrs MUST contain "column-order", which is a list of the order of the columns.
+            |                # .zattrs MUST contain "annotated-data", which is specified above.
+            |     
+            ├── column_1     # The table group MAY hold zero, one, or an arbitrary number of column arrays.
+            |   |
+            |   ├── .zarray
+            |   ├── .zattrs
+            |   |
+            |   ├── 0.0
+            |   |   ...
+            |   └── n.m
+            |
+            └── row_names    # The table group SHOULD contain a 1D array of strings called row_names.
+                |
+                ├── .zarray
+                ├── .zattrs
+                |
+                ├── 0
+                |   :
+                └── n
+
+

2.3. High-content screening

+

The following specification defines the hierarchy for a high-content screening +dataset. Three groups MUST be defined above the images:

+
    +
  • +

    the group above the images defines the well and MUST implement the well specification. All images contained in a well are fields +of view of the same well

    +
  • +

    the group above the well defines a row of wells

    +
  • +

    the group above the well row defines an entire plate i.e. a two-dimensional +collection of wells organized in rows and columns. It MUST implement the plate specification

    +
+

A well row group SHOULD NOT be present if there are no images in the well row. +A well group SHOULD NOT be present if there are no images in the well.

+
.                             # Root folder, potentially in S3,
+│
+└── 5966.zarr                 # One plate (id=5966) converted to Zarr
+    ├── .zgroup
+    ├── .zattrs               # Implements "plate" specification
+    ├── A                     # First row of the plate
+    │   ├── .zgroup
+    │   │
+    │   ├── 1                 # First column of row A
+    │   │   ├── .zgroup
+    │   │   ├── .zattrs       # Implements "well" specification
+    │   │   │
+    │   │   ├── 0             # First field of view of well A1
+    │   │   │   │
+    │   │   │   ├── .zgroup
+    │   │   │   ├── .zattrs   # Implements "multiscales", "omero"
+    │   │   │   ├── 0
+    │   │   │   │   ...       # Resolution levels
+    │   │   │   ├── n
+    │   │   │   └── labels    # Labels (optional)
+    │   │   ├── ...           # Fields of view
+    │   │   └── m
+    │   ├── ...               # Columns
+    │   └── 12
+    ├── ...                   # Rows
+    └── H
+
+

3. Metadata

+

The various .zattrs files throughout the above array hierarchy may contain metadata +keys as specified below for discovering certain types of data, especially images.

+

3.1. "axes" metadata

+

"axes" describes the dimensions of a physical coordinate space. It is a list of dictionaries, where each dictionary describes a dimension (axis) and:

+ +

If part of § 3.3 "multiscales" metadata, the length of "axes" MUST be equal to the number of dimensions of the arrays that contain the image data.

+

3.2. "coordinateTransformations" metadata

+

"coordinateTransformations" describe a series of transformations that map between two coordinate spaces (defined by "axes"). +For example, to map a discrete data space of an array to the corresponding physical space. +It is a list of dictionaries. Each entry describes a single transformation and MUST contain the field "type". +The value of "type" MUST be one of the elements of the type column in the table below. +Additional fields for the entry depend on "type" and are defined by the column fields.

+ + + + + + + +
identity + + identity transformation, is the default transformation and is typically not explicitly defined +
translation + one of: "translation":List[float], "path":str + translation vector, stored either as a list of floats ("translation") or as binary data at a location in this container (path). The length of vector defines number of dimensions. | +
scale + one of: "scale":List[float], "path":str + scale vector, stored either as a list of floats (scale) or as binary data at a location in this container (path). The length of vector defines number of dimensions. | +
type + fields + description +
+

The transformations in the list are applied sequentially and in order.

+

3.3. "multiscales" metadata

+

Metadata about an image can be found under the "multiscales" key in the group-level metadata. Here, image refers to 2 to 5 dimensional data representing image or volumetric data with optional time or channel axes. It is stored in a multiple resolution representation.

+

"multiscales" contains a list of dictionaries where each entry describes a multiscale image.

+

Each "multiscales" dictionary MUST contain the field "axes", see § 3.1 "axes" metadata. +The length of "axes" must be between 2 and 5 and MUST be equal to the dimensionality of the zarr arrays storing the image data (see "datasets:path"). +The "axes" MUST contain 2 or 3 entries of "type:space" and MAY contain one additional entry of "type:time" and MAY contain one additional entry of "type:channel" or a null / custom type. +The order of the entries MUST correspond to the order of dimensions of the zarr arrays. In addition, the entries MUST be ordered by "type" where the "time" axis must come first (if present), followed by the "channel" or custom axis (if present) and the axes of type "space". +If there are three spatial axes where two correspond to the image plane ("yx") and images are stacked along the other (anisotropic) axis ("z"), the spatial axes SHOULD be ordered as "zyx".

+

Each "multiscales" dictionary MUST contain the field "datasets", which is a list of dictionaries describing the arrays storing the individual resolution levels. +Each dictionary in "datasets" MUST contain the field "path", whose value contains the path to the array for this resolution relative +to the current zarr group. The "path"s MUST be ordered from largest (i.e. highest resolution) to smallest.

+

Each "datasets" dictionary MUST have the same number of dimensions and MUST NOT have more than 5 dimensions. The number of dimensions and order MUST correspond to number and order of "axes". +Each dictionary in "datasets" MUST contain the field "coordinateTransformations", which contains a list of transformations that map the data coordinates to the physical coordinates (as specified by "axes") for this resolution level. +The transformations are defined according to § 3.2 "coordinateTransformations" metadata. The transformation MUST only be of type translation or scale. +They MUST contain exactly one scale transformation that specifies the pixel size in physical units or time duration. If scaling information is not available or applicable for one of the axes, the value MUST express the scaling factor between the current resolution and the first resolution for the given axis, defaulting to 1.0 if there is no downsampling along the axis. +It MAY contain exactly one translation that specifies the offset from the origin in physical units. If translation is given it MUST be listed after scale to ensure that it is given in physical coordinates. +The length of the scale and translation array MUST be the same as the length of "axes". +The requirements (only scale and translation, restrictions on order) are in place to provide a simple mapping from data coordinates to physical coordinates while being compatible with the general transformation spec.

+

Each "multiscales" dictionary MAY contain the field "coordinateTransformations", describing transformations that are applied to all resolution levels in the same manner. +The transformations MUST follow the same rules about allowed types, order, etc. as in "datasets:coordinateTransformations" and are applied after them. +They can for example be used to specify the scale for a dimension that is the same for all resolutions.

+

Each "multiscales" dictionary SHOULD contain the field "name". It SHOULD contain the field "version", which indicates the version of the multiscale metadata of this image (current version is 0.5-dev).

+

Each "multiscales" dictionary SHOULD contain the field "type", which gives the type of downscaling method used to generate the multiscale image pyramid. +It SHOULD contain the field "metadata", which contains a dictionary with additional information about the downscaling method.

+

If only one multiscale is provided, use it. Otherwise, the user can choose by +name, using the first multiscale as a fallback:

+
datasets = []
+for named in multiscales:
+    if named["name"] == "3D":
+        datasets = [x["path"] for x in named["datasets"]]
+        break
+if not datasets:
+    # Use the first by default. Or perhaps choose based on chunk size.
+    datasets = [x["path"] for x in multiscales[0]["datasets"]]
+
+

3.4. "omero" metadata

+

Information specific to the channels of an image and how to render it +can be found under the "omero" key in the group-level metadata:

+
"id": 1,                              # ID in OMERO
+"name": "example.tif",                # Name as shown in the UI
+"version": "0.5-dev",                 # Current version
+"channels": [                         # Array matching the c dimension size
+    {
+        "active": true,
+        "coefficient": 1,
+        "color": "0000FF",
+        "family": "linear",
+        "inverted": false,
+        "label": "LaminB1",
+        "window": {
+            "end": 1500,
+            "max": 65535,
+            "min": 0,
+            "start": 0
+        }
+    }
+],
+"rdefs": {
+    "defaultT": 0,                    # First timepoint to show the user
+    "defaultZ": 118,                  # First Z section to show the user
+    "model": "color"                  # "color" or "greyscale"
+}
+
+

See https://docs.openmicroscopy.org/omero/5.6.1/developers/Web/WebGateway.html#imgdata +for more information.

+

3.5. "labels" metadata

+

The special group "labels" found under an image Zarr contains the key labels containing +the paths to label objects which can be found underneath the group:

+
{
+  "labels": [
+    "orphaned/0"
+  ]
+}
+
+

Unlisted groups MAY be labels.

+

3.6. "image-label" metadata

+

Groups containing the image-label dictionary represent an image segmentation +in which each unique pixel value represents a separate segmented object. image-label groups MUST also contain multiscales metadata and the two +"datasets" series MUST have the same number of entries.

+

The colors key defines a list of JSON objects describing the unique label +values. Each entry in the list MUST contain the key "label-value" with the +pixel value for that label. Additionally, the "rgba" key MAY be present, the +value for which is an RGBA unsigned-int 4-tuple: [uint8, uint8, uint8, uint8]. All label-values must be unique. Clients who choose to not throw an error +should ignore all except the _last_ entry.

+

Some implementations may represent overlapping labels by using a specially assigned +value, for example the highest integer available in the pixel range.

+

The properties key defines a list of JSON objects which also describes the unique +label values. Each entry in the list MUST contain the key "label-value" with the +pixel value for that label. Additionally, an arbitrary number of key-value pairs +MAY be present for each label value denoting associated metadata. Not all label +values must share the same key-value pairs within the properties list.

+

The source key is an optional dictionary which contains information on the +image the label is associated with. If included it MAY include a key image whose value is the relative path to a Zarr image group. The default value is +"../../" since most labels are stored under a subgroup named "labels/" (see +above).

+
"image-label":
+  {
+    "version": "0.5-dev",
+    "colors": [
+      {
+        "label-value": 1,
+        "rgba": [255, 255, 255, 0]
+      },
+      {
+        "label-value": 4,
+        "rgba": [0, 255, 255, 128]
+      },
+      ...
+      ],
+    "properties": [
+      {
+        "label-value": 1,
+        "area (pixels)": 1200,
+        "class": "foo"
+
+      },
+      {
+        "label-value": 4,
+        "area (pixels)": 1650
+      },
+      ...
+      ],
+    "source": {
+      "image": "../../"
+    }
+  }
+
+

3.7. "plate" metadata

+

For high-content screening datasets, the plate layout can be found under the +custom attributes of the plate group under the plate key in the group-level metadata.

+

The plate dictionary MAY contain an acquisitions key whose value MUST be a list of +JSON objects defining the acquisitions for a given plate to which wells can refer. Each +acquisition object MUST contain an id key whose value MUST be a unique integer identifier +greater than or equal to 0 within the context of the plate to which fields of view can refer +(see #well-md). +Each acquisition object SHOULD contain a name key whose value MUST be a string identifying +the name of the acquisition. Each acquisition object SHOULD contain a maximumfieldcount key whose value MUST be a positive integer indicating the maximum number of fields of view for the +acquisition. Each acquisition object MAY contain a description key whose value MUST be a +string specifying a description for the acquisition. Each acquisition object MAY contain +a starttime and/or endtime key whose values MUST be integer epoch timestamps specifying +the start and/or end timestamp of the acquisition.

+

The plate dictionary MUST contain a columns key whose value MUST be a list of JSON objects +defining the columns of the plate. Each column object defines the properties of +the column at the index of the object in the list. Each column in the physical plate +MUST be defined, even if no wells in the column are defined. Each column object MUST +contain a name key whose value is a string specifying the column name. The name MUST +contain only alphanumeric characters, MUST be case-sensitive, and MUST NOT be a duplicate of any +other name in the columns list. Care SHOULD be taken to avoid collisions on +case-insensitive filesystems (e.g. avoid using both Aa and aA).

+

The plate dictionary SHOULD contain a field_count key whose value MUST be a positive integer +defining the maximum number of fields per view across all wells.

+

The plate dictionary SHOULD contain a name key whose value MUST be a string defining the +name of the plate.

+

The plate dictionary MUST contain a rows key whose value MUST be a list of JSON objects +defining the rows of the plate. Each row object defines the properties of +the row at the index of the object in the list. Each row in the physical plate +MUST be defined, even if no wells in the row are defined. Each defined row MUST +contain a name key whose value MUST be a string defining the row name. The name MUST +contain only alphanumeric characters, MUST be case-sensitive, and MUST NOT be a duplicate of any +other name in the rows list. Care SHOULD be taken to avoid collisions on +case-insensitive filesystems (e.g. avoid using both Aa and aA).

+

The plate dictionary SHOULD contain a version key whose value MUST be a string specifying the +version of the plate specification.

+

The plate dictionary MUST contain a wells key whose value MUST be a list of JSON objects +defining the wells of the plate. Each well object MUST contain a path key whose value MUST +be a string specifying the path to the well subgroup. The path MUST consist of a name in +the rows list, a file separator (/), and a name from the columns list, in that order. +The path MUST NOT contain additional leading or trailing directories. +Each well object MUST contain both a rowIndex key whose value MUST be an integer identifying +the index into the rows list and a columnIndex key whose value MUST be an integer identifying +the index into the columns list. rowIndex and columnIndex MUST be 0-based. The rowIndex, columnIndex, and path MUST all refer to the same row/column pair.

+

For example the following JSON object defines a plate with two acquisitions and +6 wells (2 rows and 3 columns), containing up to 2 fields of view per acquisition.

+

The following JSON object defines a sparse plate with one acquisition and +2 wells in a 96 well plate, containing one field of view per acquisition.

+

3.8. "well" metadata

+

For high-content screening datasets, the metadata about all fields of view +under a given well can be found under the "well" key in the attributes of the +well group.

+

The well dictionary MUST contain an images key whose value MUST be a list of JSON objects +specifying all fields of view for a given well. Each image object MUST contain a path key whose value MUST be a string specifying the path to the field of view. The path MUST contain only alphanumeric characters, MUST be case-sensitive, and MUST NOT be a duplicate +of any other path in the images list. If multiple acquisitions were performed in the plate, +each image object MUST contain an acquisition key whose value MUST be an integer identifying the acquisition +which MUST match one of the acquisition JSON objects defined in the plate metadata (see #plate-md).

+

The well dictionary SHOULD contain a version key whose value MUST be a string specifying the +version of the well specification.

+

For example the following JSON object defines a well with four fields of +view. The first two fields of view were part of the first acquisition while +the last two fields of view were part of the second acquisition.

+

The following JSON object defines a well with two fields of view in a plate with +four acquisitions. The first field is part of the first acquisition, and the second +field is part of the last acquisition.

+

4. Specification naming style

+

Multi-word keys in this specification should use the camelCase style. +NB: some parts of the specification don’t obey this convention as they +were added before this was adopted, but they should be updated in due course.

+

5. Implementations

+

Projects which support reading and/or writing OME-NGFF data include:

+
+
bigdataviewer-ome-zarr +
Fiji-plugin for reading OME-Zarr. +
bioformats2raw +
A performant, Bio-Formats image file format converter. +
omero-ms-zarr +
A microservice for OMERO.server that converts images stored in OMERO to OME-Zarr files on the fly, served via a web API. +
idr-zarr-tools +
A full workflow demonstrating the conversion of IDR images to OME-Zarr images on S3. +
OMERO CLI Zarr plugin +
An OMERO CLI plugin that converts images stored in OMERO.server into a local Zarr file. +
ome-zarr-py +
A napari plugin for reading ome-zarr files. +
vizarr +
A minimal, purely client-side program for viewing Zarr-based images with Viv & ImJoy. +
+

Diagram of related projects

+

All implementations present an equivalent representation of a dataset which can be downloaded or uploaded freely. An interactive +version of this diagram is available from the OME2020 Workshop. +Mouse over the black boxes representing the implementations above to get a quick tip on how to use them.

+

Note: If you would like to see your project listed, please open an issue or PR on the ome/ngff repository.

+

6. Citing

+

Next-generation file format (NGFF) specifications for storing bioimaging data in the cloud. J. Moore, et al. Editors. Open Microscopy Environment Consortium, 8 February 2022. +This edition of the specification is https://ngff.openmicroscopy.org/0.4/. +The latest edition is available at https://ngff.openmicroscopy.org/latest/. (doi:10.5281/zenodo.4282107)

+

7. Version History

+ + + + + + + + + + + + + +
Revision + Date + Description +
0.4.0 + 2022-02-08 + multiscales: add axes type, units and coordinateTransformations +
0.4.0 + 2022-02-08 + plate: add rowIndex/columnIndex +
0.3.0 + 2021-08-24 + Add axes field to multiscale metadata +
0.2.0 + 2021-03-29 + Change chunk dimension separator to "/" +
0.1.4 + 2020-11-26 + Add HCS specification +
0.1.3 + 2020-09-14 + Add labels specification +
0.1.2 + 2020-05-07 + Add description of "omero" metadata +
0.1.1 + 2020-05-06 + Add info on the ordering of resolutions +
0.1.0 + 2020-04-20 + First version for internal demo +
+
+
+

Conformance

+

Document conventions

+

Conformance requirements are expressed + with a combination of descriptive assertions + and RFC 2119 terminology. + The key words “MUST”, “MUST NOT”, “REQUIRED”, “SHALL”, “SHALL NOT”, “SHOULD”, “SHOULD NOT”, “RECOMMENDED”, “MAY”, and “OPTIONAL” + in the normative parts of this document + are to be interpreted as described in RFC 2119. + However, for readability, + these words do not appear in all uppercase letters in this specification.

+

All of the text of this specification is normative + except sections explicitly marked as non-normative, examples, and notes. [RFC2119]

+

Examples in this specification are introduced with the words “for example” + or are set apart from the normative text + with class="example", + like this:

+
+ +

This is an example of an informative example.

+
+

Informative notes begin with the word “Note” + and are set apart from the normative text + with class="note", + like this:

+

Note, this is an informative note.

+
+

Conformant Algorithms

+

Requirements phrased in the imperative as part of algorithms + (such as "strip any leading space characters" + or "return false and abort these steps") + are to be interpreted with the meaning of the key word + ("must", "should", "may", etc) + used in introducing the algorithm.

+

Conformance requirements phrased as algorithms or specific steps + can be implemented in any manner, + so long as the end result is equivalent. + In particular, the algorithms defined in this specification + are intended to be easy to understand + and are not intended to be performant. + Implementers are encouraged to optimize.

+
+
+ +

Index

+

Terms defined by this specification

+
    +
  • HDF5, in § 1.1 +
  • NGFF, in § 1.1 +
+

References

+

Normative References

+
+
[RFC2119] +
S. Bradner. Key words for use in RFCs to Indicate Requirement Levels. March 1997. Best Current Practice. URL: https://datatracker.ietf.org/doc/html/rfc2119 +
+

Informative References

+
+
[N5] +
John A. Bogovic; et al. N5---a scalable Java API for hierarchies of chunked n-dimensional tensors and structured meta-data. 2020. Informational. URL: https://github.com/saalfeldlab/n5/issues/62 +
[OME-ZARR-PY] +
OME; et al. ome-zarr-py: Experimental implementation of next-generation file format (NGFF) specifications for storing bioimaging data in the cloud.. 06 October 2020. Informational. URL: https://doi.org/10.5281/zenodo.4113931 +
[ZARR] +
Alistair Miles; et al. Zarr: An implementation of chunked, compressed, N-dimensional arrays for Python.. 06 October 2020. Informational. URL: https://doi.org/10.5281/zenodo.4069231 +
\ No newline at end of file From 5d6cf996830cbd553624200baf0c7c11f4dc7f73 Mon Sep 17 00:00:00 2001 From: Virginia Scarlett Date: Mon, 5 Jun 2023 17:09:31 -0400 Subject: [PATCH 14/35] add backticks around the word tables/ --- latest/index.bs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/latest/index.bs b/latest/index.bs index 0beffc31..519c82bf 100644 --- a/latest/index.bs +++ b/latest/index.bs @@ -181,16 +181,16 @@ A table is a 2-dimensional data structure consisting of rows and columns. Tables are an intuitive way of storing arbitrary data or metadata that may be large, highly structured, and may or may not be associated with an image. -In OME-NGFF, a table is a Zarr group containing one or more Zarr arrays, where each +In OME-NGFF, a table is a Zarr group containing zero, one, or more Zarr arrays, where each array represents one column of the table. Columns are ordered, and each column in a table MUST have the same number of rows. While the table itself MUST be 2-dimensional, the columns need not be. 1-dimensional columns will be typical. Nevertheless, there may be use cases in which it is sensible to conceptualize a 2D, 3D, or higher-dimensional array as a single column. -Tables in an OME-NGFF file are located in the tables/ directory, in the root of an image's +Tables in an OME-NGFF file are located in the `tables/` directory, in the root of an image's Zarr group, alongside the labels/ directory if one is present. The .zattrs file immediately -within tables/ MUST contain the "annotated-data" property, as shown below: +within `tables/` MUST contain the "annotated-data" property, as shown below: ```json "annotated-data": [ # A JSON array containing all array / dimension pairs this table is annotating @@ -216,14 +216,14 @@ to the length of the dimension that is being annotated in the source array. 
If ` exactly one entry, the first dimension of all arrays in the table group SHOULD be chunked in the same way as the corresponding dimension of the source array. -Note: The AnnData data model is based on a main array with additional 'annotation' +Note: The AnnData data model is based on a main array with additional "annotation" tables that have the same number of rows, columns, or both, as the main table. AnnData -objects can be stored in the tables/ directory of an OME-NGFF file. See the AnnData +objects can be stored in the `tables/` directory of an OME-NGFF file. See the AnnData documentation for detailed recommendations on formatting AnnData data structures within Zarr. -There MAY be one or more intermediate directories between tables/ and a particular table. +There MAY be one or more intermediate directories between `tables/` and a particular table. These SHOULD NOT contain metadata, unless the intermediate directory represents the parent -directory for an AnnData object. The names of directories beneath the tables/ +directory for an AnnData object. The names of directories beneath the `tables/` directory are arbitrary, except in the AnnData case. From 2639d9ebcfcb2d8eca49cc10d31219c2a067ce3c Mon Sep 17 00:00:00 2001 From: Virginia Scarlett Date: Mon, 5 Jun 2023 17:10:48 -0400 Subject: [PATCH 15/35] update index.html --- index.html | 6 ------ latest/index.html | 16 ++++++++-------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/index.html b/index.html index 74f803c4..e69de29b 100644 --- a/index.html +++ b/index.html @@ -1,6 +0,0 @@ - - - - -

If you are not redirected in five seconds, click here.

- diff --git a/latest/index.html b/latest/index.html index 67cc9513..4f06af21 100644 --- a/latest/index.html +++ b/latest/index.html @@ -743,15 +743,15 @@