From ee0426e6541c48a14e6cf70af0122fff4c80b06e Mon Sep 17 00:00:00 2001 From: Iman Akbari Date: Thu, 23 Feb 2023 02:04:12 -0500 Subject: [PATCH 1/3] Revamp weather-mv docs for Docker image release --- weather_mv/Examples.md | 272 +++++++++++++++++ weather_mv/README.md | 650 +++++++++++++++-------------------------- 2 files changed, 510 insertions(+), 412 deletions(-) create mode 100644 weather_mv/Examples.md diff --git a/weather_mv/Examples.md b/weather_mv/Examples.md new file mode 100644 index 00000000..29f29573 --- /dev/null +++ b/weather_mv/Examples.md @@ -0,0 +1,272 @@ +# Weather Mover Examples + +## BigQuery + + +Using the subcommand alias `bq`: + +```bash +weather-mv bq --uris "gs://your-bucket/*.nc" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery + --direct_num_workers 2 +``` + +Preview load with a dry run: + +```bash +weather-mv bq --uris "gs://your-bucket/*.nc" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery + --direct_num_workers 2 \ + --dry-run +``` + +Load COG's (.tif) files: + +```bash +weather-mv bq --uris "gs://your-bucket/*.tif" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery + --direct_num_workers 2 \ + --tif_metadata_for_datetime start_time +``` + +Upload only a subset of variables: + +```bash +weather-mv bq --uris "gs://your-bucket/*.nc" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --variables u10 v10 t + --temp_location "gs://$BUCKET/tmp" \ + --direct_num_workers 2 +``` + +Upload all variables, but for a specific geographic region (for example, the +continental US): + +```bash +weather-mv bq --uris "gs://your-bucket/*.nc" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --area 49 -124 24 -66 \ + --temp_location "gs://$BUCKET/tmp" \ + --direct_num_workers 2 +``` + +Control how weather data is opened with XArray: + +```bash +weather-mv bq --uris "gs://your-bucket/*.grib" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \ + --temp_location "gs://$BUCKET/tmp" \ + --direct_num_workers 2 +``` + +Using DataflowRunner: + +```bash +weather-mv bq --uris "gs://your-bucket/*.nc" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --runner DataflowRunner \ + --project $PROJECT \ + --region $REGION \ + --temp_location "gs://$BUCKET/tmp" \ + --job_name $JOB_NAME +``` + +For a full list of how to configure the Dataflow pipeline, please review +[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options). + +## Regrid + +Using the subcomand alias 'rg': + +```bash +weather-mv rg --uris "gs://your-bucket/*.gb" \ + --output_path "gs://regrid-bucket/" + +``` + +Preview regrid with a dry run: + +```bash +weather-mv rg --uris "gs://your-bucket/*.gb" \ + --output_path "gs://regrid-bucket/" \ + --dry-run + +``` + +Interpolate to a finer grid resolution: + +```bash +weather-mv rg --uris "gs://your-bucket/*.gb" \ + --output_path "gs://regrid-bucket/" \ + --regrid_kwargs '{"grid": [0.1, 0.1]}'. + +``` + +Interpolate to a high-resolution octahedral gaussian grid: + +```bash +weather-mv rg --uris "gs://your-bucket/*.gb" \ + --output_path "gs://regrid-bucket/" \ + --regrid_kwargs '{"grid": "O1280}'. + +``` + +Convert gribs to NetCDF on copy: + +```bash +weather-mv rg --uris "gs://your-bucket/*.gb" \ + --output_path "gs://regrid-bucket/" \ + --to_netcdf +``` + +Using DataflowRunner: + +```bash +weather-mv rg --uris "gs://your-bucket/*.nc" \ + --output_path "gs://regrid-bucket/" \ + --runner DataflowRunner \ + --project $PROJECT \ + --region $REGION \ + --temp_location "gs://$BUCKET/tmp" \ + --experiment=use_runner_v2 \ + --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" \ + --job_name $JOB_NAME +``` + +Using DataflowRunner, with added disk per VM: + +```bash +weather-mv rg --uris "gs://your-bucket/*.nc" \ + --output_path "gs://regrid-bucket/" \ + --runner DataflowRunner \ + --project $PROJECT \ + --region $REGION \ + --disk_size_gb 250 \ + --temp_location "gs://$BUCKET/tmp" \ + --experiment=use_runner_v2 \ + --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" \ + --job_name $JOB_NAME +``` + +## Earth Engine + +Using the subcommand alias `ee`: + +```bash +weather-mv ee --uris "gs://your-bucket/*.grib" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib + --ee_asset "projects/$PROJECT/assets/test_dir" +``` + +Preview ingestion with a dry run: + +```bash +weather-mv ee --uris "gs://your-bucket/*.grib" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib + --ee_asset "projects/$PROJECT/assets/test_dir" \ + --dry-run +``` + +Authenticate earth engine using personal account: + +```bash +weather-mv ee --uris "gs://your-bucket/*.grib" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib + --ee_asset "projects/$PROJECT/assets/test_dir" \ + --use_personal_account +``` + +Authenticate earth engine using a private key: + +```bash +weather-mv ee --uris "gs://your-bucket/*.grib" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib + --ee_asset "projects/$PROJECT/assets/test_dir" \ + --service_account "my-service-account@...gserviceaccount.com" \ + --private_key "path/to/private_key.json" +``` + +Ingest asset as table in earth engine: + +```bash +weather-mv ee --uris "gs://your-bucket/*.grib" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib + --ee_asset "projects/$PROJECT/assets/test_dir" \ + --ee_asset_type "TABLE" +``` + +Restrict merging all bands or grib normalization: + +```bash +weather-mv ee --uris "gs://your-bucket/*.grib" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib + --ee_asset "projects/$PROJECT/assets/test_dir" \ + --disable_grib_schema_normalization +``` + +Control how weather data is opened with XArray: + +```bash +weather-mv ee --uris "gs://your-bucket/*.grib" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib + --ee_asset "projects/$PROJECT/assets/test_dir" + --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \ + --temp_location "gs://$BUCKET/tmp" +``` + +Limit EE requests: + +```bash +weather-mv ee --uris "gs://your-bucket/*.grib" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib + --ee_asset "projects/$PROJECT/assets/test_dir" \ + --ee_qps 10 \ + --ee_latency 0.5 \ + --ee_max_concurrent 10 +``` + +Custom Band names: + +```bash +weather-mv ee --uris "gs://your-bucket/*.tif" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif + --ee_asset "projects/$PROJECT/assets/test_dir" \ + --band_names_mapping "filename.json" +``` + +Getting initialization and forecast/end date-time from the filename: + +```bash +weather-mv ee --uris "gs://your-bucket/*.tif" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif + --ee_asset "projects/$PROJECT/assets/test_dir" \ + --initialization_time_regex "$REGEX" \ + --forecast_time_regex "$REGEX" +``` + +Example: + +```bash +weather-mv ee --uris "gs://tmp-gs-bucket/3B-HHR-E_MS_MRG_3IMERG_20220901-S000000-E002959_0000_V06C_30min.tiff" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif + --ee_asset "projects/$PROJECT/assets/test_dir" \ + --initialization_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S%H%M%S-*tiff" \ + --forecast_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S*-E%H%M%S*tiff" +``` + +Using DataflowRunner: + +```bash +weather-mv ee --uris "gs://your-bucket/*.grib" \ + --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib + --ee_asset "projects/$PROJECT/assets/test_dir" + --runner DataflowRunner \ + --project $PROJECT \ + --region $REGION \ + --temp_location "gs://$BUCKET/tmp" \ + --job_name $JOB_NAME +``` diff --git a/weather_mv/README.md b/weather_mv/README.md index b2e3dd91..7d60677f 100644 --- a/weather_mv/README.md +++ b/weather_mv/README.md @@ -1,27 +1,47 @@ -# ⛅️ `weather-mv` – Weather Mover - -Weather Mover loads weather data from cloud storage into analytics engines, -like [Google BigQuery](https://cloud.google.com/bigquery) (_alpha_). - -## Features - -* **Rapid Querability**: After geospatial data is in BigQuery, data wranging becomes as simple as writing SQL. This - allows for rapid data exploration, visualization, and model pipeline prototyping. -* **Simple Versioning**: All rows in the table come with a `data_import_time` column. This provides some notion of how - the data is versioned. Downstream analysis can adapt to data ingested at differen times by updating a `WHERE` clause. -* **Parallel Upload**: Each file will be processed in parallel. With Dataflow autoscaling, even large datasets can be - processed in a reasonable amount of time. -* **Streaming support**: When running the mover in streaming mode, it will automatically process files as they appear in - cloud buckets via PubSub. -* _(new)_ **Grib Regridding**: `weather-mv regrid` uses [MetView](https://metview.readthedocs.io/en/latest/) to +# ⛅️ `weather-mv` – Weather Mover _(alpha)_ + +Weather Mover provides an easy and scalable way for: + +1. Loading weather datasets + from Cloud Storage into [BigQuery](https://cloud.google.com/bigquery) and + [Google Earth Engine](https://earthengine.google.com/) +2. Fast [interpolation & grid conversions](https://metview.readthedocs.io/en/latest/gen_files/icon_functions/regrid.html) +on large weather datasets + +It supports popular multi-dimensional data formats including NetCDF, GRIB, +GeoTIFF, and all other formats [supported by Xarray](https://docs.xarray.dev/en/stable/user-guide/io.html). + +## Why do we need this? + +* **Upload Speed**: Files in a weather dataset will be processed in parallel. + With [Dataflow autoscaling](https://cloud.google.com/dataflow/docs/horizontal-autoscaling), + even large datasets can be transferred to analytical engines in a + reasonable amount of time. +* **Rapid Querying**: After your geospatial data is in BigQuery, data wrangling + becomes as simple as writing SQL. This + allows for rapid data exploration, visualization, and model pipeline + prototyping. +* **Simple Versioning**: All rows in the destination BigQuery table come with + a `data_import_time` + column. This provides some notion of how + the data is versioned. Downstream analysis can adapt to data ingested at + different times by updating a `WHERE` clause. +* **Streaming Support**: Weather Mover supports a streaming mode where it + automatically processes files as they appear in cloud buckets via PubSub. +* **Scalable Regridding**: The regrid option + uses [MetView](https://metview.readthedocs.io/en/latest/) to interpolate Grib files to a [range of grids.](https://metview.readthedocs.io/en/latest/metview/using_metview/regrid_intro.html?highlight=grid#grid) -* _(new)_ **Earth Engine Ingestion**: `weather-mv earthengine` ingests weather data into [Google Earth Engine](https://earthengine.google.com/). + To learn more about its use cases, + visit [MetView docs](https://metview.readthedocs.io/en/latest/metview/using_metview/regrid_intro.html#regrid-explained). +* **Earth Engine**: By ingesting weather data into [Google Earth Engine](https://earthengine.google.com/), + research teams can use its powerful [APIs and App Development interface](https://developers.google.com/earth-engine) + for visualization and analysis of geospatial data. ## Usage ``` -usage: weather-mv [-h] {bigquery,bq,regrid,rg} ... +usage: weather-mv [-h] {bigquery,bq,regrid,rg,earthengine,ee} ... Weather Mover loads weather data from cloud storage into analytics engines. @@ -36,21 +56,27 @@ optional arguments: -h, --help show this help message and exit ``` -The weather mover makes use of subcommands to distinguish between tasks. The above tasks are currently supported. +The three main subcommands are: -_Common options_ +* `bigquery` or `bq`: Ingest data from Cloud Storage into BigQuery +* `earthengine` or `ee`: Ingest data from Cloud Storage into Earth Engine +* `regrid` or `rg`: Regrid data in Cloud Storage using MetView -* `-i, --uris`: (required) URI glob pattern matching input weather data, e.g. 'gs://ecmwf/era5/era5-2015-*.gb'. -* `--topic`: A Pub/Sub topic for GCS OBJECT_FINALIZE events, or equivalent, of a cloud bucket. E.g. - 'projects//topics/'. -* `--window_size`: Output file's window size in minutes. Only used with the `topic` flag. Default: 1.0 minute. -* `--num_shards`: Number of shards to use when writing windowed elements to cloud storage. Only used with the `topic` - flag. Default: 5 shards. -* `-d, --dry-run`: Preview the load into BigQuery. Default: off. +### General Flags + +* `-i, --uris`: (required) URI glob pattern matching input weather data, e.g. ` + gs://ecmwf/era5/era5-2015-*.gb`. +* `--topic`: A Pub/Sub topic for GCS `OBJECT_FINALIZE` [events](https://cloud.google.com/storage/docs/pubsub-notifications#events), + e.g. `projects//topics/`. +* `--window_size`: Output file's window size in minutes, only used with + the `topic` flag. (default: _1 min_) +* `--num_shards`: Number of shards to use when writing windowed elements to + Cloud Storage. Only used with the `topic` flag. (default: _5_) +* `-d, --dry-run`: Preview the load into BigQuery. (default: _disabled_) Invoke with `-h` or `--help` to see the full range of options. -### `weather-mv bigquery` +### BigQuery ``` usage: weather-mv bigquery [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE] [--num_shards NUM_SHARDS] [-d] @@ -61,33 +87,46 @@ usage: weather-mv bigquery [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SI [--coordinate_chunk_size COORDINATE_CHUNK_SIZE] ``` -The `bigquery` subcommand loads weather data into BigQuery. In addition to the common options above, users may specify +The `bigquery` subcommand loads weather data into BigQuery. In addition to the +common options above, users may specify command-specific options: _Command options_: -* `-o, --output_table`: (required) Full name of destination BigQuery table. Ex: my_project.my_dataset.my_table -* `-v, --variables`: Target variables (or coordinates) for the BigQuery schema. Default: will import all data variables +* `-o, --output_table`: (required) Full name of destination BigQuery table. Ex: + my_project.my_dataset.my_table +* `-v, --variables`: Target variables (or coordinates) for the BigQuery schema. + Default: will import all data variables as columns. -* `-a, --area`: Target area in [N, W, S, E]. Default: Will include all available area. -* `--import_time`: When writing data to BigQuery, record that data import occurred at this time +* `-a, --area`: Target area in [N, W, S, E]. Default: Will include all + available area. +* `--import_time`: When writing data to BigQuery, record that data import + occurred at this time (format: YYYY-MM-DD HH:MM:SS.usec+offset). Default: now in UTC. -* `--infer_schema`: Download one file in the URI pattern and infer a schema from that file. Default: off -* `--xarray_open_dataset_kwargs`: Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string. -* `--coordinate_chunk_size`: The size of the chunk of coordinates used for extracting vector data into BigQuery. Used to +* `--infer_schema`: Download one file in the URI pattern and infer a schema from + that file. Default: off +* `--xarray_open_dataset_kwargs`: Keyword-args to pass + into `xarray.open_dataset()` in the form of a JSON string. +* `--coordinate_chunk_size`: The size of the chunk of coordinates used for + extracting vector data into BigQuery. Used to tune parallel uploads. -* `--tif_metadata_for_datetime` : Metadata that contains tif file's timestamp. Applicable only for tif files. -* `-s, --skip-region-validation` : Skip validation of regions for data migration. Default: off. -* `--disable_grib_schema_normalization` : To disable grib's schema normalization. Default: off. +* `--tif_metadata_for_datetime` : Metadata that contains tif file's timestamp. + Applicable only for tif files. +* `-s, --skip-region-validation` : Skip validation of regions for data + migration. Default: off. +* `--disable_grib_schema_normalization` : To disable grib's schema + normalization. Default: off. Invoke with `bq -h` or `bigquery --help` to see the full range of options. -> Note: In case of grib files, by default its schema will be normalized and the name of the data variables will look +> Note: In case of grib files, by default its schema will be normalized and the +> name of the data variables will look > like `___`. -> -> This solves the issue of skipping over some of the data due to: https://github.com/ecmwf/cfgrib#filter-heterogeneous-grib-files. +> +> This solves the issue of skipping over some of the data due +> to: https://github.com/ecmwf/cfgrib#filter-heterogeneous-grib-files. -_Usage examples_: +_Example:_ ```bash weather-mv bigquery --uris "gs://your-bucket/*.nc" \ @@ -96,81 +135,10 @@ weather-mv bigquery --uris "gs://your-bucket/*.nc" \ --direct_num_workers 2 ``` -Using the subcommand alias `bq`: +For an extensive set of examples, visit [here](Examples.md#bigquery). -```bash -weather-mv bq --uris "gs://your-bucket/*.nc" \ - --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ - --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery - --direct_num_workers 2 -``` -Preview load with a dry run: - -```bash -weather-mv bq --uris "gs://your-bucket/*.nc" \ - --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ - --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery - --direct_num_workers 2 \ - --dry-run -``` - -Load COG's (.tif) files: - -```bash -weather-mv bq --uris "gs://your-bucket/*.tif" \ - --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ - --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery - --direct_num_workers 2 \ - --tif_metadata_for_datetime start_time -``` - -Upload only a subset of variables: - -```bash -weather-mv bq --uris "gs://your-bucket/*.nc" \ - --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ - --variables u10 v10 t - --temp_location "gs://$BUCKET/tmp" \ - --direct_num_workers 2 -``` - -Upload all variables, but for a specific geographic region (for example, the continental US): - -```bash -weather-mv bq --uris "gs://your-bucket/*.nc" \ - --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ - --area 49 -124 24 -66 \ - --temp_location "gs://$BUCKET/tmp" \ - --direct_num_workers 2 -``` - -Control how weather data is opened with XArray: - -```bash -weather-mv bq --uris "gs://your-bucket/*.grib" \ - --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ - --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \ - --temp_location "gs://$BUCKET/tmp" \ - --direct_num_workers 2 -``` - -Using DataflowRunner: - -```bash -weather-mv bq --uris "gs://your-bucket/*.nc" \ - --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ - --runner DataflowRunner \ - --project $PROJECT \ - --region $REGION \ - --temp_location "gs://$BUCKET/tmp" \ - --job_name $JOB_NAME -``` - -For a full list of how to configure the Dataflow pipeline, please review -[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options). - -### `weather-mv regrid` +### Regrid ``` usage: weather-mv regrid [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE] [--num_shards NUM_SHARDS] [-d] @@ -179,25 +147,12 @@ usage: weather-mv regrid [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE The `regrid` subcommand makes a regridded copy of the input data with MetView. -To use this capability of the weather mover, please use the `[regrid]` extra when installing: - -```shell -pip install google-weather-tools[regrid] -``` - -> **Warning**: MetView requires a decent amount of disk space in order to perform any regrid operation! Intermediary -> regridding steps will write temporary grib data to disk. Thus, please make use of the `--disk_size_gb` Dataflow -> option. A good rule of thumb would be to consume `30 + 2.5x` GBs of disk, where `x` is the size of each source data -> file. -> -> TODO([#191](https://github.com/google/weather-tools/issues/191)): Find smaller disk space bound. - -In addition to the common options above, users may specify command-specific options: _Command options_: * `-o, --output_path`: (required) The destination path for the regridded files. -* `-k, --regrid_kwargs`: Keyword-args to pass into `metview.regrid()` in the form of a JSON string. Will default to +* `-k, --regrid_kwargs`: Keyword-args to pass into `metview.regrid()` in the + form of a JSON string. Will default to '{"grid": [0.25, 0.25]}'. * `--to_netcdf`: Write output file in NetCDF via XArray. Default: off @@ -206,11 +161,18 @@ consult [this documentation.](https://metview.readthedocs.io/en/latest/metview/u Invoke with `rg -h` or `regrid --help` to see the full range of options. -> Note: Currently, `regrid` doesn't work out-of-the-box! Until [#172](https://github.com/google/weather-tools/issues/172) -> is fixed, users will have to use a workaround in order to ensure MetView is installed in their runner environment -> (instructions are below). +> **Warning**: MetView requires a decent amount of disk space in order to +> perform any regrid operation! Intermediary +> regridding steps will write temporary grib data to disk. Thus, please make use +> of the `--disk_size_gb` Dataflow +> option. A good rule of thumb would be to consume `30 + 2.5x` GBs of disk, +> where `x` is the size of each source data +> file. +> +> TODO([#191](https://github.com/google/weather-tools/issues/191)): Find smaller +> disk space bound. -_Usage examples_: +_Example_: ```bash weather-mv regrid --uris "gs://your-bucket/*.gb" \ @@ -218,82 +180,12 @@ weather-mv regrid --uris "gs://your-bucket/*.gb" \ ``` -Using the subcomand alias 'rg': - -```bash -weather-mv rg --uris "gs://your-bucket/*.gb" \ - --output_path "gs://regrid-bucket/" - -``` - -Preview regrid with a dry run: - -```bash -weather-mv rg --uris "gs://your-bucket/*.gb" \ - --output_path "gs://regrid-bucket/" \ - --dry-run - -``` - -Interpolate to a finer grid resolution: - -```bash -weather-mv rg --uris "gs://your-bucket/*.gb" \ - --output_path "gs://regrid-bucket/" \ - --regrid_kwargs '{"grid": [0.1, 0.1]}'. - -``` - -Interpolate to a high-resolution octahedral gaussian grid: - -```bash -weather-mv rg --uris "gs://your-bucket/*.gb" \ - --output_path "gs://regrid-bucket/" \ - --regrid_kwargs '{"grid": "O1280}'. - -``` - -Convert gribs to NetCDF on copy: - -```bash -weather-mv rg --uris "gs://your-bucket/*.gb" \ - --output_path "gs://regrid-bucket/" \ - --to_netcdf -``` - -Using DataflowRunner: - -```bash -weather-mv rg --uris "gs://your-bucket/*.nc" \ - --output_path "gs://regrid-bucket/" \ - --runner DataflowRunner \ - --project $PROJECT \ - --region $REGION \ - --temp_location "gs://$BUCKET/tmp" \ - --experiment=use_runner_v2 \ - --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" \ - --job_name $JOB_NAME -``` - -Using DataflowRunner, with added disk per VM: - -```bash -weather-mv rg --uris "gs://your-bucket/*.nc" \ - --output_path "gs://regrid-bucket/" \ - --runner DataflowRunner \ - --project $PROJECT \ - --region $REGION \ - --disk_size_gb 250 \ - --temp_location "gs://$BUCKET/tmp" \ - --experiment=use_runner_v2 \ - --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" \ - --job_name $JOB_NAME -``` +For an extensive set of examples, visit [here](Examples.md#regrid). For a full list of how to configure the Dataflow pipeline, please review [this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options). -### `weather-mv earthengine` +### Earth Engine ``` usage: weather-mv earthengine [-h] -i URIS --asset_location ASSET_LOCATION --ee_asset EE_ASSET @@ -303,35 +195,52 @@ usage: weather-mv earthengine [-h] -i URIS --asset_location ASSET_LOCATION --ee_ [--ee_qps EE_QPS] [--ee_latency EE_LATENCY] [--ee_max_concurrent EE_MAX_CONCURRENT] ``` -The `earthengine` subcommand ingests weather data into Earth Engine. In addition to the common options above, -users may specify command-specific options: +The `earthengine` subcommand ingests weather data into Earth Engine. In addition +to the general flags above, users may specify command-specific options: _Command options_: -* `--asset_location`: (required) Bucket location at which asset files will be pushed. -* `--ee_asset`: (required) The asset folder path in earth engine project where the asset files will be pushed. - It should be in format: `projects//assets/`. Make sure that is there - under in earth engine assets. i.e. projects/my-gcp-project/assets/my/foo/bar. -* `--ee_asset_type`: The type of asset to ingest in the earth engine. Default: IMAGE. +* `--asset_location`: (required) Bucket location at which asset files will be + pushed. +* `--ee_asset`: (required) The asset folder path in earth engine project where + the asset files will be pushed. + It should be in format: `projects//assets/`. Make + sure that is there + under in earth engine assets. i.e. + projects/my-gcp-project/assets/my/foo/bar. +* `--ee_asset_type`: The type of asset to ingest in the earth engine. Default: + IMAGE. Supported types are `IMAGE` and `TABLE`.\ `IMAGE`: Uploads georeferenced raster datasets in GeoTIFF format.\ - `TABLE`: Uploads the datsets in the CSV format. Useful in case of point data (sparse data). -* `--disable_grib_schema_normalization`: Restricts merging of grib datasets. Default: False -* `-u, --use_personal_account`: To use personal account for earth engine authentication. -* `--service_account`: Service account address when using a private key for earth engine authentication. -* `--private_key`: To use a private key for earth engine authentication. Only used with the `service_account` flag. -* `--xarray_open_dataset_kwargs`: Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string. -* `-s, --skip-region-validation` : Skip validation of regions for data migration. Default: off. -* `--ee_qps`: Maximum queries per second allowed by EE for your project. Default: 10. + `TABLE`: Uploads the datsets in the CSV format. Useful in case of point data ( + sparse data). +* `--disable_grib_schema_normalization`: Restricts merging of grib datasets. + Default: False +* `-u, --use_personal_account`: To use personal account for earth engine + authentication. +* `--service_account`: Service account address when using a private key for + earth engine authentication. +* `--private_key`: To use a private key for earth engine authentication. Only + used with the `service_account` flag. +* `--xarray_open_dataset_kwargs`: Keyword-args to pass + into `xarray.open_dataset()` in the form of a JSON string. +* `-s, --skip-region-validation` : Skip validation of regions for data + migration. Default: off. +* `--ee_qps`: Maximum queries per second allowed by EE for your project. + Default: 10. * `--ee_latency`: The expected latency per requests, in seconds. Default: 0.5. -* `--ee_max_concurrent`: Maximum concurrent api requests to EE allowed for your project. Default: 10. -* `--band_names_mapping`: A JSON file which contains the band names for the TIFF file. -* `--initialization_time_regex`: A Regex string to get the initialization time from the filename. -* `--forecast_time_regex`: A Regex string to get the forecast/end time from the filename. +* `--ee_max_concurrent`: Maximum concurrent api requests to EE allowed for your + project. Default: 10. +* `--band_names_mapping`: A JSON file which contains the band names for the TIFF + file. +* `--initialization_time_regex`: A Regex string to get the initialization time + from the filename. +* `--forecast_time_regex`: A Regex string to get the forecast/end time from the + filename. Invoke with `ee -h` or `earthengine --help` to see the full range of options. -_Usage examples_: +_Example_: ```bash weather-mv earthengine --uris "gs://your-bucket/*.grib" \ @@ -339,140 +248,56 @@ weather-mv earthengine --uris "gs://your-bucket/*.grib" \ --ee_asset "projects/$PROJECT/assets/test_dir" ``` -Using the subcommand alias `ee`: +For an extensive set of examples, visit [here](Examples.md#earth-engine). -```bash -weather-mv ee --uris "gs://your-bucket/*.grib" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib - --ee_asset "projects/$PROJECT/assets/test_dir" -``` +## Configuring Dataflow -Preview ingestion with a dry run: +One of the most powerful features of Apache Beam is supporting a wide range of +execution backends, called [_runners_](https://beam.apache.org/documentation/runners/direct/). +The default [_Direct Runner_](https://beam.apache.org/documentation/runners/direct/) + is a multi-threaded single-node backend which executes the pipeline in the local +machine. In contrast, the [_Dataflow Runner_](https://beam.apache.org/documentation/runners/dataflow/) +deploys the pipeline on [Cloud Dataflow](https://cloud.google.com/dataflow), +a fully-managed Google Cloud service which is highly optimized for Apache Beam +pipelines. -```bash -weather-mv ee --uris "gs://your-bucket/*.grib" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib - --ee_asset "projects/$PROJECT/assets/test_dir" \ - --dry-run -``` - -Authenticate earth engine using personal account: - -```bash -weather-mv ee --uris "gs://your-bucket/*.grib" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib - --ee_asset "projects/$PROJECT/assets/test_dir" \ - --use_personal_account -``` - -Authenticate earth engine using a private key: - -```bash -weather-mv ee --uris "gs://your-bucket/*.grib" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib - --ee_asset "projects/$PROJECT/assets/test_dir" \ - --service_account "my-service-account@...gserviceaccount.com" \ - --private_key "path/to/private_key.json" -``` - -Ingest asset as table in earth engine: - -```bash -weather-mv ee --uris "gs://your-bucket/*.grib" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib - --ee_asset "projects/$PROJECT/assets/test_dir" \ - --ee_asset_type "TABLE" -``` - -Restrict merging all bands or grib normalization: +Using Dataflow has many benefits including convenient high-performance +integration with Cloud Storage and BigQuery as well as easy scaling. +The runner used for Weather Tools pipelines can be configured using +the `--runner` option (e.g. `--runner DataflowRunner`). +Additional configurations may also be passed to customize the Dataflow +pipeline including resources and autoscaling settings. -```bash -weather-mv ee --uris "gs://your-bucket/*.grib" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib - --ee_asset "projects/$PROJECT/assets/test_dir" \ - --disable_grib_schema_normalization -``` - -Control how weather data is opened with XArray: - -```bash -weather-mv ee --uris "gs://your-bucket/*.grib" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib - --ee_asset "projects/$PROJECT/assets/test_dir" - --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \ - --temp_location "gs://$BUCKET/tmp" -``` - -Limit EE requests: - -```bash -weather-mv ee --uris "gs://your-bucket/*.grib" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib - --ee_asset "projects/$PROJECT/assets/test_dir" \ - --ee_qps 10 \ - --ee_latency 0.5 \ - --ee_max_concurrent 10 -``` - -Custom Band names: - -```bash -weather-mv ee --uris "gs://your-bucket/*.tif" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif - --ee_asset "projects/$PROJECT/assets/test_dir" \ - --band_names_mapping "filename.json" -``` - -Getting initialization and forecast/end date-time from the filename: - -```bash -weather-mv ee --uris "gs://your-bucket/*.tif" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif - --ee_asset "projects/$PROJECT/assets/test_dir" \ - --initialization_time_regex "$REGEX" \ - --forecast_time_regex "$REGEX" -``` - -Example: - -```bash -weather-mv ee --uris "gs://tmp-gs-bucket/3B-HHR-E_MS_MRG_3IMERG_20220901-S000000-E002959_0000_V06C_30min.tiff" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif - --ee_asset "projects/$PROJECT/assets/test_dir" \ - --initialization_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S%H%M%S-*tiff" \ - --forecast_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S*-E%H%M%S*tiff" -``` - -Using DataflowRunner: - -```bash -weather-mv ee --uris "gs://your-bucket/*.grib" \ - --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib - --ee_asset "projects/$PROJECT/assets/test_dir" - --runner DataflowRunner \ - --project $PROJECT \ - --region $REGION \ - --temp_location "gs://$BUCKET/tmp" \ - --job_name $JOB_NAME -``` - -For a full list of how to configure the Dataflow pipeline, please review +For a full reference of how to configure the Dataflow pipeline, please review [this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options). ## Streaming ingestion `weather-mv` optionally provides the ability to react -to [Pub/Sub events for objects added to GCS](https://cloud.google.com/storage/docs/pubsub-notifications). This can be -used to automate ingestion into BigQuery as soon as weather data is disseminated. Another common use case it to -automatically create a down-sampled version of a dataset with `regrid`. To set up the Weather Mover with streaming -ingestion, use the `--topic` flag (see "Common options" above). - -Objects that don't match the `--uris` glob pattern will be filtered out of ingestion. This way, a bucket can contain +to [Pub/Sub events for objects added to GCS](https://cloud.google.com/storage/docs/pubsub-notifications). +This can be used to automate ingestion into BigQuery as soon as weather data is +disseminated. Another common use case it to automatically create a down-sampled +version of a dataset with `regrid`. To set up the Weather Mover with streaming +ingestion, use the `--topic` flag (see [General Flags](#general-flags)). + +Objects that don't match the `--uris` glob pattern will be filtered out of +ingestion. This way, a bucket can contain multiple types of data yet only have subsets processed with `weather-mv`. -> It's worth noting: when setting up PubSub, **make sure to create a topic for GCS `OBJECT_FINALIZE` events only.** +> **NOTE:** When setting up PubSub, make sure to create a topic for +**GCS `OBJECT_FINALIZE` events only**. + +> **NOTE:** Data is written into BigQuery using streaming inserts. It may +> take [up to 90 minutes](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability) +> for buffers to persist into storage. However, weather data will be available for +> querying immediately. -_Usage examples_: +> **NOTE:** It's recommended that you specify variables to +> ingest (`-v, --variables`) instead of inferring the schema for +> streaming pipelines. Not all variables will be distributed with every file, +> especially when they are in Grib format. + +_Examples_: ```shell weather-mv bq --uris "gs://your-bucket/*.nc" \ @@ -510,24 +335,15 @@ weather-mv bq --uris "gs://your-bucket/*.nc" \ --job_name $JOB_NAME ``` -### BigQuery - -Data is written into BigQuery using streaming inserts. It may -take [up to 90 minutes](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability) -for buffers to persist into storage. However, weather data will be available for querying immediately. - -> Note: It's recommended that you specify variables to ingest (`-v, --variables`) instead of inferring the schema for -> streaming pipelines. Not all variables will be distributed with every file, especially when they are in Grib format. - -## Private Network Configuration +## Known Issues -While running `weather-mv` pipeline in GCP, there is a possibility that you may receive following error - -"Quotas were exceeded: IN_USE_ADDRESSES" +### "Quotas were exceeded: IN_USE_ADDRESSES" -This error occurs when GCP is trying to add new worker-instances and finds that, “Public IP” quota (assigned to your -project) is exhausted. +This error occurs when GCP is trying to add new worker-instances and finds that, +“Public IP” quota (assigned to your project) is exhausted. -To solve this, we recommend using private IP while running your dataflow pipelines. +To solve this, we recommend using private IP while running your dataflow +pipelines. ```shell weather-mv bq --uris "gs://your-bucket/*.nc" \ @@ -541,53 +357,63 @@ weather-mv bq --uris "gs://your-bucket/*.nc" \ --subnetwork=regions/$REGION/subnetworks/$SUBNETWORK ``` -_Common options_: +_Network configuration options_: -* `--no_use_public_ips`: To make Dataflow workers use private IP addresses for all communication, specify the - command-line flag: --no_use_public_ips. Make sure that the specified network or subnetwork has Private Google Access +* `--no_use_public_ips`: To make Dataflow workers use private IP addresses for + all communication, specify the + command-line flag: --no_use_public_ips. Make sure that the specified network + or subnetwork has Private Google Access enabled. -* `--network`: The Compute Engine network for launching Compute Engine instances to run your pipeline. -* `--subnetwork`: The Compute Engine subnetwork for launching Compute Engine instances to run your pipeline. - -For more information regarding how to configure Private IP, please refer -to [Private IP Configuration Guide for Dataflow Pipeline Execution](../Private-IP-Configuration.md) -. - -For more information regarding Pipeline options, please refer -to [pipeline-options](https://cloud.google.com/dataflow/docs/reference/pipeline-options). - -## Custom Dataflow Container for ECMWF dependencies (like MetView) - -It's difficult to install all necessary system dependencies on a Dataflow worker with a pure python solution. For -example, MetView requires binaries to be installed on the system machine, which are broken in the standard debian -install channels (they are only maintained via `conda-forge`). - -Thus, to include such dependencies, we've provided steps for you to build -a [Beam container environment](https://beam.apache.org/documentation/runtime/environments/). In the near future, we'll -arrange things so you don't have to worry about any of these extra -steps ([#172](https://github.com/google/weather-tools/issues/172)). See [these instructions](../Runtime-Container.md) -to learn how to build a custom image for this project. - -Currently, this image is necessary for the `weather-mv regrid` command, but no other commands. To deploy this tool, -please do the following: - -1. Host a container image of the included Dockerfile in your repository of choice (instructions for building images in - GCS are in the next section). -2. Add the following two flags to your regrid pipeline. - ``` - --experiment=use_runner_v2 \ - --sdk_container_image=$CONTAINER_URL - ``` - For example, the full Dataflow command, assuming you follow the next section's instructions, should look like: - - ```bash - weather-mv rg --uris "gs://your-bucket/*.nc" \ - --output_path "gs://regrid-bucket/" \ - --runner DataflowRunner \ - --project $PROJECT \ - --region $REGION \ - --temp_location "gs://$BUCKET/tmp" \ - --experiment=use_runner_v2 \ - --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" - --job_name $JOB_NAME - ``` +* `--network`: The Compute Engine network for launching Compute Engine instances + to run your pipeline. +* `--subnetwork`: The Compute Engine subnetwork for launching Compute Engine + instances to run your pipeline. + +To learn more, visit [Private IP Configuration Guide for Dataflow Pipeline Execution](../Private-IP-Configuration.md). + +For more information regarding Dataflow pipeline options, see the [Configuring Dataflow](#configuring-dataflow) +section above. + +## Dataflow Container Image + +Weather Mover uses 3rd-party binary dependencies including MetView which can +not be reliably set up using pure python solutions. To overcome this, we use +the [Conda-forge](https://conda-forge.org/) distribution of these binaries. +While our [Conda environment](https://github.com/google/weather-tools/blob/main/environment.yml) +is sufficient for local runs, in remote executions like Dataflow, a Docker +container image with the dependencies pre-installed is required. + +By default, our public Docker image +`gcr.io/weather-tools-prod/weather-tools:x.y.z` is used when the pipeline is +executed using _DataflowRunner_. The image is built from [this Dockerfile](https://github.com/google/weather-tools/blob/main/Dockerfile) +and produces the same Conda environment. + +For development purposes, you may use your own customized Docker image. See +[these instructions](../Runtime-Container.md) to learn how to build a custom +image for this project using [Cloud Build](https://cloud.google.com/build). +After building and pushing the custom container image to a container registry of +your choice (like [Cloud Container Registry](https://cloud.google.com/container-registry)), +you can then specify the image used by Dataflow workers for running your +pipeline using these flags: + + ``` + --experiment=use_runner_v2 \ + --sdk_container_image=$CONTAINER_URL + ``` + +_Example:_ + + ```bash + weather-mv rg --uris "gs://your-bucket/*.nc" \ + --output_path "gs://regrid-bucket/" \ + --runner DataflowRunner \ + --project $PROJECT \ + --region $REGION \ + --temp_location "gs://$BUCKET/tmp" \ + --experiment=use_runner_v2 \ + --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" + --job_name $JOB_NAME + ``` + +> **NOTE:** Currently, this image is only necessary for the `weather-mv regrid` +> command as the other commands use pure python dependencies. From 54a76769413c4972bf1b92d1f8c5365b8e58dab9 Mon Sep 17 00:00:00 2001 From: Iman Akbari Date: Thu, 23 Feb 2023 12:32:16 -0500 Subject: [PATCH 2/3] Add default SDK image and make temp_location required --- weather_mv/README.md | 10 ++++---- weather_mv/loader_pipeline/bq.py | 2 ++ weather_mv/loader_pipeline/pipeline.py | 34 ++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/weather_mv/README.md b/weather_mv/README.md index 7d60677f..6efe97ca 100644 --- a/weather_mv/README.md +++ b/weather_mv/README.md @@ -2,14 +2,12 @@ Weather Mover provides an easy and scalable way for: -1. Loading weather datasets - from Cloud Storage into [BigQuery](https://cloud.google.com/bigquery) and +1. Loading weather dataset from Cloud Storage into [BigQuery](https://cloud.google.com/bigquery) and [Google Earth Engine](https://earthengine.google.com/) 2. Fast [interpolation & grid conversions](https://metview.readthedocs.io/en/latest/gen_files/icon_functions/regrid.html) -on large weather datasets + on large weather datasets -It supports popular multi-dimensional data formats including NetCDF, GRIB, -GeoTIFF, and all other formats [supported by Xarray](https://docs.xarray.dev/en/stable/user-guide/io.html). +It supports popular multi-dimensional data formats including NetCDF, GRIB, and GeoTIFF. ## Why do we need this? @@ -95,6 +93,8 @@ _Command options_: * `-o, --output_table`: (required) Full name of destination BigQuery table. Ex: my_project.my_dataset.my_table +* `--temp_location`: (required) GCS directory used for staging temporary files + used in BigQuery table inserts * `-v, --variables`: Target variables (or coordinates) for the BigQuery schema. Default: will import all data variables as columns. diff --git a/weather_mv/loader_pipeline/bq.py b/weather_mv/loader_pipeline/bq.py index d1833df1..0ade6b20 100644 --- a/weather_mv/loader_pipeline/bq.py +++ b/weather_mv/loader_pipeline/bq.py @@ -101,6 +101,8 @@ def add_parser_arguments(cls, subparser: argparse.ArgumentParser): subparser.add_argument('-o', '--output_table', type=str, required=True, help="Full name of destination BigQuery table (..). Table " "will be created if it doesn't exist.") + subparser.add_argument('--temp_location', type=str, required=True, + help="GCS directory used for staging temporary files used in BigQuery table inserts") subparser.add_argument('-v', '--variables', metavar='variables', type=str, nargs='+', default=list(), help='Target variables (or coordinates) for the BigQuery schema. Default: will import ' 'all data variables as columns.') diff --git a/weather_mv/loader_pipeline/pipeline.py b/weather_mv/loader_pipeline/pipeline.py index 856ee45d..1c30a48d 100644 --- a/weather_mv/loader_pipeline/pipeline.py +++ b/weather_mv/loader_pipeline/pipeline.py @@ -28,6 +28,8 @@ logger = logging.getLogger(__name__) +DEFAULT_CONTAINER_IMAGE = "gcr.io/weather-tools-prod/weather-tools:0.0.0" + def configure_logger(verbosity: int) -> None: """Configures logging from verbosity. Default verbosity will show errors.""" @@ -45,6 +47,21 @@ def pattern_to_uris(match_pattern: str, is_zarr: bool = False) -> t.Iterable[str yield from [x.path for x in match.metadata_list] +def _arg_list_includes(args: t.List[str], key: str, val: str = None) -> bool: + """check if a "--key [val]" or "--key=val" argument exists in args list""" + # single valued check (looking for "--key") + if val is None: + for arg in args: + if arg == f"--{key}" or arg.split("=")[0] == f"--{key}": + return True + return False + # double valued check (looking for "--key=value", "--key value") + for idx, arg in enumerate(args[:-1]): + if (arg == f"--{key}" and args[idx + 1] == val) or arg == f"--{key}={val}": + return True + return False + + def pipeline(known_args: argparse.Namespace, pipeline_args: t.List[str]) -> None: all_uris = list(pattern_to_uris(known_args.uris, known_args.zarr)) if not all_uris: @@ -127,8 +144,19 @@ def run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]: help='Move data into Google EarthEngine') ToEarthEngine.add_parser_arguments(ee_parser) + # print top-level help if zero arguments are passed + if len(argv) == 1: + parser.print_help() + exit(0) + known_args, pipeline_args = parser.parse_known_args(argv[1:]) + # temporary location is shown as a mandatory argument in help messages + # but also needs to be available in pipeline args + if hasattr(known_args, 'temp_location') and known_args.temp_location: + logger.debug("Using temporary location %s", known_args.temp_location) + pipeline_args.extend(['--temp_location', known_args.temp_location]) + configure_logger(2) # 0 = error, 1 = warn, 2 = info, 3 = debug # Validate Zarr arguments @@ -160,4 +188,10 @@ def run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]: # workflow rely on global context (e.g., a module imported at module level). pipeline_args.extend('--save_main_session true'.split()) + # add default image for Dataflow runner + if _arg_list_includes(pipeline_args, 'runner', 'DataflowRunner') \ + and not _arg_list_includes(pipeline_args, 'sdk_container_image'): + logger.info('Using default Dataflow container image "%s"', DEFAULT_CONTAINER_IMAGE) + pipeline_args.extend(['--sdk_container_image', DEFAULT_CONTAINER_IMAGE]) + return known_args, pipeline_args From 14623bda79e8e8608f1a08600ce00ced4e80af0d Mon Sep 17 00:00:00 2001 From: Iman Akbari Date: Thu, 23 Feb 2023 12:48:50 -0500 Subject: [PATCH 3/3] Fix type hint --- weather_mv/loader_pipeline/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weather_mv/loader_pipeline/pipeline.py b/weather_mv/loader_pipeline/pipeline.py index 1c30a48d..15b1b399 100644 --- a/weather_mv/loader_pipeline/pipeline.py +++ b/weather_mv/loader_pipeline/pipeline.py @@ -47,7 +47,7 @@ def pattern_to_uris(match_pattern: str, is_zarr: bool = False) -> t.Iterable[str yield from [x.path for x in match.metadata_list] -def _arg_list_includes(args: t.List[str], key: str, val: str = None) -> bool: +def _arg_list_includes(args: t.List[str], key: str, val: t.Optional[str] = None) -> bool: """check if a "--key [val]" or "--key=val" argument exists in args list""" # single valued check (looking for "--key") if val is None: