diff --git a/NESP/README.md b/NESP/README.md index bf45380..5fc72d9 100644 --- a/NESP/README.md +++ b/NESP/README.md @@ -1,43 +1,140 @@ -# Intro +# NESP 5.9 — User Code Library -## Pre-requisites -Python package and environment manager. We highly recommend [uv](https://docs.astral.sh/uv/#installation). +This directory contains notebooks and utilities for loading, exploring, and analysing the **NESP 5.9 marine datasets** stored in AWS S3. -## Usage -1. Navigate to the NESP directory of the repo +Each dataset has its own subdirectory with a dedicated README; datasets with published analyses (currently AMSA, Seabird, and Seagrass) also include a Python notebook (`.ipynb`) and an R notebook (`.Rmd`). -2. Create and activate the python environment +--- + +## Datasets + +| Dataset | Description | Folder | S3 Key | +|---------|-------------|--------|--------| +| AMSA | AIS Vessel Tracking (2012–2025) | [`amsa/`](amsa/) | `stored/datauplift/amsa/year=*/source=*/*.parquet` | +| Kelp | Squidle+ Kelp Annotations | [`kelp/`](kelp/) | `stored/datauplift/kelp/kelp.parquet` | +| NRMN | NRMN Reef Life Surveys | [`nrmn/`](nrmn/) | `stored/datauplift/nrmn/nrmn.parquet` | +| Seabird | Seabird Observations and Tracking | [`seabird/`](seabird/) | `stored/datauplift/seabird/seabird.parquet` | +| Seagrass | Seagrass Surveys | [`seagrass/`](seagrass/) | `stored/datauplift/seagrass/seagrass.parquet` | +| Dugongs | Dugong Sightings and Distribution | [`dugongs/`](dugongs/) | `stored/datauplift/dugongs/dugongs.parquet` | + +All datasets are publicly accessible in the `data-uplift-public` S3 bucket (region: `ap-southeast-2`); the Dugongs key is a placeholder until its S3 path is confirmed. 
+ +--- + +## Repository Structure + +``` +NESP/ +├── nesp/ +│ ├── __init__.py +│ └── util.py # Shared utilities (H3 mapping, color scales, schema display) +├── h3.md # H3 hexagonal indexing reference +├── pyproject.toml # Python project and dependency definition +├── requirements.txt # Pinned Python dependencies +├── uv.lock # uv lockfile +├── OffshoreRenewable_Energy_Infrastructure_Regions.zip # Shapefile used in spatial analyses +│ +├── amsa/ +│ ├── README.md +│ ├── amsa.ipynb +│ └── amsa.Rmd +├── kelp/ +│ └── README.md +├── nrmn/ +│ └── README.md +├── seabird/ +│ ├── README.md +│ ├── seabird.ipynb +│ └── seabird.Rmd +├── seagrass/ +│ ├── README.md +│ ├── seagrass.ipynb +│ └── seagrass.Rmd +└── dugongs/ + └── README.md +``` + +--- + +## Python Environment Setup + +Python 3.12 or later is required. We recommend [uv](https://docs.astral.sh/uv/) for environment and package management. + +### With `uv` (recommended) + +```bash +# From the NESP/ directory: +uv venv && source .venv/bin/activate && uv pip install . +``` + +### With `pip` ```bash -uv venv \ -&& source .venv/bin/activate \ -&& uv pip install . +python -m venv .venv && source .venv/bin/activate && pip install . ``` -3. 
Launch the jupyter server +### Running notebooks ```bash jupyter notebook ``` - ## NESP 5.9 Datasets - -| Dataset name | Description | Metadata | S3 URL | -| ------------ | ----------- | -------- | ------ | -| AMSA | AMSA Vessel Tracking | - | [link](s3://data-uplift-public/stored/datauplift/amsa/) | -| Kelp | Squidle+ Kelp Annotations | - | [link](s3://data-uplift-public/stored/datauplift/kelp/kelp.parquet) | -| NRMN | NRMN Reef Life Surveys | - | [link](s3://data-uplift-public/stored/datauplift/nrmn/nrmn.parquet) | -| Seabird | Seabird Observations and Tracking | - | [link](s3://data-uplift-public/stored/datauplift/seabird/seabird.parquet) | -| Seagrass | Seagrass Surveys | - | [link](s3://data-uplift-public/stored/datauplift/seagrass/seagrass.parquet) | - -### S3 Details\n" -NESP 5.9 datasets are currently stored in S3 and are publicly available as per the following table:\n" - -| Bucket | Key | Partitioned | -| -------------------- | -------------------------------------------------------- | ----------- | -| `data-uplift-public` | `stored/datauplift/amsa/year=*/source=*/*.parquet` | [x] | -| `data-uplift-public` | `stored/datauplift/kelp/kelp.parquet` | [ ] | -| `data-uplift-public` | `stored/datauplift/nrmn/nrmn.parquet` | [ ] |\n", -| `data-uplift-public` | `stored/datauplift/seabird/seabird.parquet` | [ ] | -| `data-uplift-public` | `stored/datauplift/seagrass/seagrass.parquet` | [ ] | +Or open any `.ipynb` in your IDE and select the `.venv` as the kernel. 
+ +### Key Python packages + +| Package | Purpose | +|---------|---------| +| `pyarrow` | Parquet I/O and S3 dataset connection | +| `polars` | DataFrame and LazyFrame computation | +| `polars-h3` | H3 spatial indexing within Polars | +| `polars-st` | Spatial (geometry) operations in Polars | +| `pydeck` | GPU-accelerated H3 hexagon map rendering | +| `h3` | H3 indexing for polygon-to-cell conversion | +| `geopandas` | Shapefile loading and CRS handling | +| `matplotlib` / `seaborn` | Statistical plotting | +| `rich` | Schema and table display in notebooks | + +> **Note:** Shared utilities live in the `nesp` package (`nesp/util.py`) and are installed when you run `uv pip install .` or `pip install .`. Notebooks import them as `from nesp import util` — no path manipulation needed. + +--- + +## R Environment Setup + +R 4.x or later is required. Install the following packages from CRAN and GitHub before running any `.Rmd` notebook. + +### CRAN packages + +```r +install.packages(c( + "arrow", # S3 dataset connection and Parquet I/O + "sf", # Spatial features + "dplyr", # Data manipulation + "tidyr", # Data reshaping + "stringr", # String operations + "lubridate", # Date/time handling + "ggplot2", # Plotting + "leaflet" # Interactive maps +)) +``` + +### H3 for R (from GitHub) + +The `h3-r` package is recommended as it bundles the underlying C library automatically: + +```r +# install.packages("remotes") +remotes::install_github("crazycapivara/h3-r") +``` + +> See the [h3-r documentation](https://crazycapivara.github.io/h3-r/articles/h3.html) for usage examples. + +### Running notebooks + +Open any `.Rmd` file in RStudio and click **Knit**, or run chunks interactively. + +--- + +## H3 Spatial Indexing +All datasets are spatially aggregated using the [Uber H3](https://h3geo.org/) hexagonal grid system. See [`h3.md`](h3.md) for an overview of H3 concepts, resolution levels, and why hexagons are used over traditional grids. 
diff --git a/NESP/amsa/README.md b/NESP/amsa/README.md new file mode 100644 index 0000000..0331201 --- /dev/null +++ b/NESP/amsa/README.md @@ -0,0 +1,26 @@ +# AMSA — Vessel Tracking + +## Description +AIS (Automatic Identification System) vessel tracking records collected by the Australian Maritime Safety Authority (AMSA), covering Australian waters from 2012 to 2025. + +## Dataset Details + +| Property | Value | +|-------------|-------| +| Bucket | `data-uplift-public` | +| Key | `stored/datauplift/amsa/year=*/source=*/*.parquet` | +| Partitioned | Yes (by year and source) | +| Format | Parquet | + +## Notebooks + +| Notebook | Language | Description | +|----------|----------|-------------| +| `amsa.ipynb` | Python | Vessel density mapping with H3 aggregation, longitudinal data health analysis, and regional deep-dives (e.g. Sydney Harbour) | +| `amsa.Rmd` | R | Equivalent analysis using the `arrow`, `h3`, and `ggplot2` R packages | + +## Dataset-Specific Notes +- The full dataset exceeds typical RAM limits. The Python notebook uses a Polars `LazyFrame` and streaming aggregation to avoid loading all data into memory. +- Vessel density follows a power-law distribution. The Python notebook applies a log₁₀ transform before coloring to make both busy ports and open-ocean routes visible. +- Records include an `australianMarineRegionsTags` column for pre-computed spatial region labels. +- The AMSA dataset is already indexed at H3 Resolution 8 (`h3Index` column). For finer-grained regional analysis, re-index to Resolution 9–12. 
diff --git a/NESP/amsa.Rmd b/NESP/amsa/amsa.Rmd similarity index 100% rename from NESP/amsa.Rmd rename to NESP/amsa/amsa.Rmd diff --git a/NESP/amsa.ipynb b/NESP/amsa/amsa.ipynb similarity index 99% rename from NESP/amsa.ipynb rename to NESP/amsa/amsa.ipynb index 2c13417..34dd09c 100644 --- a/NESP/amsa.ipynb +++ b/NESP/amsa/amsa.ipynb @@ -160,7 +160,7 @@ "import pyarrow\n", "import pyarrow.fs\n", "import pyarrow.dataset\n", - "import util\n", + "from nesp import util\n", "\n", "# --- FileSystem Configuration ---\n", "# Initialize S3FileSystem for ap-southeast-2 region.\n", @@ -185,7 +185,7 @@ "rich.print('Total Inspected files: ',len(list(ds.get_fragments())))\n", "\n", "# Display schema definition and units.\n", - "schema_rich_table = util.generate_schema_rich_table(\n", + "schema_rich_table = util.print_schema_rich_table(\n", " schema=ds.schema,\n", " metadata_keys=[\n", " \"definition\",\n", diff --git a/NESP/dugongs/README.md b/NESP/dugongs/README.md new file mode 100644 index 0000000..a7af77f --- /dev/null +++ b/NESP/dugongs/README.md @@ -0,0 +1,22 @@ +# Dugongs + +## Description +Dugong (*Dugong dugon*) sighting and distribution records from Australian marine surveys. Dugongs are a protected marine mammal and an indicator species for seagrass ecosystem health. + +## Dataset Details + +| Property | Value | +|-------------|-------| +| Bucket | `data-uplift-public` | +| Key | `stored/datauplift/dugongs/dugongs.parquet` *(placeholder — confirm S3 path)* | +| Partitioned | No | +| Format | Parquet | + +## Notebooks + +> Notebooks for this dataset are yet to be created. Use the existing `amsa`, `seabird`, or `seagrass` notebooks as a reference template. 
+ +| Notebook | Language | Description | +|----------|----------|-------------| +| `dugongs.ipynb` | Python | *(to be created)* | +| `dugongs.Rmd` | R | *(to be created)* | diff --git a/NESP/kelp/README.md b/NESP/kelp/README.md new file mode 100644 index 0000000..b6055f8 --- /dev/null +++ b/NESP/kelp/README.md @@ -0,0 +1,22 @@ +# Kelp — Squidle+ Annotations + +## Description +Kelp presence/absence annotations sourced from the Squidle+ platform, covering benthic imagery surveys across Australian coastal waters. + +## Dataset Details + +| Property | Value | +|-------------|-------| +| Bucket | `data-uplift-public` | +| Key | `stored/datauplift/kelp/kelp.parquet` | +| Partitioned | No | +| Format | Parquet | + +## Notebooks + +> Notebooks for this dataset are yet to be created. Use the existing `seagrass` or `seabird` notebooks as a reference template. + +| Notebook | Language | Description | +|----------|----------|-------------| +| `kelp.ipynb` | Python | *(to be created)* | +| `kelp.Rmd` | R | *(to be created)* | diff --git a/NESP/nesp/__init__.py b/NESP/nesp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/NESP/util.py b/NESP/nesp/util.py similarity index 100% rename from NESP/util.py rename to NESP/nesp/util.py diff --git a/NESP/nrmn/README.md b/NESP/nrmn/README.md new file mode 100644 index 0000000..a4a925a --- /dev/null +++ b/NESP/nrmn/README.md @@ -0,0 +1,22 @@ +# NRMN — Reef Life Surveys + +## Description +Reef Life Survey (RLS) data from the National Reef Monitoring Network (NRMN), covering fish and invertebrate abundance, species richness, and community composition across Australian reef systems. + +## Dataset Details + +| Property | Value | +|-------------|-------| +| Bucket | `data-uplift-public` | +| Key | `stored/datauplift/nrmn/nrmn.parquet` | +| Partitioned | No | +| Format | Parquet | + +## Notebooks + +> Notebooks for this dataset are yet to be created. Use the existing `seabird` or `seagrass` notebooks as a reference template. 
+ +| Notebook | Language | Description | +|----------|----------|-------------| +| `nrmn.ipynb` | Python | *(to be created)* | +| `nrmn.Rmd` | R | *(to be created)* | diff --git a/NESP/pyproject.toml b/NESP/pyproject.toml index f82d133..ed5b7f8 100644 --- a/NESP/pyproject.toml +++ b/NESP/pyproject.toml @@ -1,3 +1,10 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["nesp"] + [project] name = "nesp" version = "1.0.0" diff --git a/NESP/seabird/README.md b/NESP/seabird/README.md new file mode 100644 index 0000000..ecd8c6e --- /dev/null +++ b/NESP/seabird/README.md @@ -0,0 +1,24 @@ +# Seabird — Observations and Tracking + +## Description +Seabird observation and tracking records aggregated from Australian monitoring programs. Covers species occurrence, movement, and spatial distribution across Australian marine environments. + +## Dataset Details + +| Property | Value | +|-------------|-------| +| Bucket | `data-uplift-public` | +| Key | `stored/datauplift/seabird/seabird.parquet` | +| Partitioned | No | +| Format | Parquet | + +## Notebooks + +| Notebook | Language | Description | +|----------|----------|-------------| +| `seabird.ipynb` | Python | Data exploration, H3 spatial aggregation, and mapping of seabird distribution | +| `seabird.Rmd` | R | Equivalent analysis using the `arrow`, `h3`, and `ggplot2` R packages | + +## Authors +- Thomas Galindo (Python notebook) +- Denisse Fierro Arcos (R translation) diff --git a/NESP/seabird.Rmd b/NESP/seabird/seabird.Rmd similarity index 100% rename from NESP/seabird.Rmd rename to NESP/seabird/seabird.Rmd diff --git a/NESP/seabird.ipynb b/NESP/seabird/seabird.ipynb similarity index 97% rename from NESP/seabird.ipynb rename to NESP/seabird/seabird.ipynb index a7c90f2..0583c93 100644 --- a/NESP/seabird.ipynb +++ b/NESP/seabird/seabird.ipynb @@ -136,44 +136,7 @@ "id": "6", "metadata": {}, "outputs": [], - "source": [ - "import pyarrow\n", - "import 
pyarrow.fs\n", - "import pyarrow.dataset\n", - "import util\n", - "\n", - "# Construct the anonymous file system responsible for reading from the public S3 bucket\n", - "FILE_SYSTEM = pyarrow.fs.S3FileSystem(\n", - " region=\"ap-southeast-2\", \n", - " anonymous=True,\n", - ")\n", - "\n", - "# Create the dataset connection\n", - "# By convention, datasets are labelled `ds`\n", - "ds = pyarrow.dataset.dataset(\n", - " source=\"data-uplift-public/stored/datauplift/seabird/seabird.parquet\",\n", - " filesystem=FILE_SYSTEM,\n", - ")\n", - "\n", - "# The comprising files of the dataset can be inspected\n", - "print(list(ds.get_fragments()))\n", - "\n", - "# The schema of the dataset can be inspected\n", - "util.print_schema_rich_table(\n", - " schema=ds.schema,\n", - " metadata_keys=[\"definition\", \"units\", \"long_name\", \"resolution\"]\n", - ")\n", - "\n", - "# For the full table scheme use native pyarrow representation\n", - "# rich.print(ds.schema)\n", - "\n", - "# Estimate dataset size\n", - "estimated_megabytes = util.estimate_dataset_size(\n", - " ds=ds,\n", - " n_samples=10_000,\n", - ")\n", - "print(f\"DS estimated size: {round(estimated_megabytes, 2)}MB\")" - ] + "source": "import pyarrow\nimport pyarrow.fs\nimport pyarrow.dataset\nfrom nesp import util\n\n# Construct the anonymous file system responsible for reading from the public S3 bucket\nFILE_SYSTEM = pyarrow.fs.S3FileSystem(\n region=\"ap-southeast-2\", \n anonymous=True,\n)\n\n# Create the dataset connection\n# By convention, datasets are labelled `ds`\nds = pyarrow.dataset.dataset(\n source=\"data-uplift-public/stored/datauplift/seabird/seabird.parquet\",\n filesystem=FILE_SYSTEM,\n)\n\n# The comprising files of the dataset can be inspected\nprint(list(ds.get_fragments()))\n\n# The schema of the dataset can be inspected\nutil.print_schema_rich_table(\n schema=ds.schema,\n metadata_keys=[\"definition\", \"units\", \"long_name\", \"resolution\"]\n)\n\n# For the full table scheme use native pyarrow 
representation\n# rich.print(ds.schema)\n\n# Estimate dataset size\nestimated_megabytes = util.estimate_dataset_size(\n ds=ds,\n n_samples=10_000,\n)\nprint(f\"DS estimated size: {round(estimated_megabytes, 2)}MB\")" }, { "cell_type": "markdown", @@ -1332,4 +1295,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/NESP/seagrass/README.md b/NESP/seagrass/README.md new file mode 100644 index 0000000..dabeff2 --- /dev/null +++ b/NESP/seagrass/README.md @@ -0,0 +1,24 @@ +# Seagrass — Surveys + +## Description +Seagrass survey records from Australian coastal and marine environments, covering species presence, cover, and site-level habitat assessments. + +## Dataset Details + +| Property | Value | +|-------------|-------| +| Bucket | `data-uplift-public` | +| Key | `stored/datauplift/seagrass/seagrass.parquet` | +| Partitioned | No | +| Format | Parquet | + +## Notebooks + +| Notebook | Language | Description | +|----------|----------|-------------| +| `seagrass.ipynb` | Python | Data exploration, H3 spatial aggregation, and mapping of seagrass survey sites | +| `seagrass.Rmd` | R | Equivalent analysis using the `arrow`, `h3`, and `ggplot2` R packages | + +## Authors +- Thomas Galindo (Python notebook) +- Denisse Fierro Arcos (R translation) diff --git a/NESP/seagrass.Rmd b/NESP/seagrass/seagrass.Rmd similarity index 100% rename from NESP/seagrass.Rmd rename to NESP/seagrass/seagrass.Rmd diff --git a/NESP/seagrass.ipynb b/NESP/seagrass/seagrass.ipynb similarity index 96% rename from NESP/seagrass.ipynb rename to NESP/seagrass/seagrass.ipynb index 0b26d8f..f10c128 100644 --- a/NESP/seagrass.ipynb +++ b/NESP/seagrass/seagrass.ipynb @@ -137,44 +137,7 @@ "id": "6", "metadata": {}, "outputs": [], - "source": [ - "import pyarrow\n", - "import pyarrow.fs\n", - "import pyarrow.dataset\n", - "import util\n", - "\n", - "# Construct the anonymous file system responsible for reading from the public S3 bucket\n", - "FILE_SYSTEM = 
pyarrow.fs.S3FileSystem(\n", - " region=\"ap-southeast-2\", \n", - " anonymous=True,\n", - ")\n", - "\n", - "# Create the dataset connection\n", - "# By convention, datasets are labelled `ds`\n", - "ds = pyarrow.dataset.dataset(\n", - " source=\"data-uplift-public/stored/datauplift/seagrass/seagrass.parquet\",\n", - " filesystem=FILE_SYSTEM,\n", - ")\n", - "\n", - "# The comprising files of the dataset can be inspected\n", - "print(list(ds.get_fragments()))\n", - "\n", - "# The schema of the dataset can be inspected\n", - "util.print_schema_rich_table(\n", - " schema=ds.schema,\n", - " metadata_keys=[\"definition\", \"units\", \"long_name\", \"resolution\"]\n", - ")\n", - "\n", - "# For the full table scheme use native pyarrow representation\n", - "# rich.print(ds.schema)\n", - "\n", - "# Estimate dataset size\n", - "estimated_megabytes = util.estimate_dataset_size(\n", - " ds=ds,\n", - " n_samples=10_000,\n", - ")\n", - "print(f\"DS estimated size: {round(estimated_megabytes, 2)}MB\")" - ] + "source": "import pyarrow\nimport pyarrow.fs\nimport pyarrow.dataset\nfrom nesp import util\n\n# Construct the anonymous file system responsible for reading from the public S3 bucket\nFILE_SYSTEM = pyarrow.fs.S3FileSystem(\n region=\"ap-southeast-2\", \n anonymous=True,\n)\n\n# Create the dataset connection\n# By convention, datasets are labelled `ds`\nds = pyarrow.dataset.dataset(\n source=\"data-uplift-public/stored/datauplift/seagrass/seagrass.parquet\",\n filesystem=FILE_SYSTEM,\n)\n\n# The comprising files of the dataset can be inspected\nprint(list(ds.get_fragments()))\n\n# The schema of the dataset can be inspected\nutil.print_schema_rich_table(\n schema=ds.schema,\n metadata_keys=[\"definition\", \"units\", \"long_name\", \"resolution\"]\n)\n\n# For the full table scheme use native pyarrow representation\n# rich.print(ds.schema)\n\n# Estimate dataset size\nestimated_megabytes = util.estimate_dataset_size(\n ds=ds,\n n_samples=10_000,\n)\nprint(f\"DS estimated size: 
{round(estimated_megabytes, 2)}MB\")" }, { "cell_type": "markdown", @@ -1148,4 +1111,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/NESP/uv.lock b/NESP/uv.lock index 27263c6..a0854ac 100644 --- a/NESP/uv.lock +++ b/NESP/uv.lock @@ -1234,7 +1234,7 @@ wheels = [ [[package]] name = "nesp" version = "1.0.0" -source = { virtual = "." } +source = { editable = "." } dependencies = [ { name = "altair" }, { name = "geopandas" },