diff --git a/weather_mv/Examples.md b/weather_mv/Examples.md
new file mode 100644
index 00000000..29f29573
--- /dev/null
+++ b/weather_mv/Examples.md
@@ -0,0 +1,272 @@
+# Weather Mover Examples
+
+## BigQuery
+
+
+Using the subcommand alias `bq`:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+ --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+ --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery
+ --direct_num_workers 2
+```
+
+Preview load with a dry run:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+ --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+ --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery
+ --direct_num_workers 2 \
+ --dry-run
+```
+
+Load COG's (.tif) files:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.tif" \
+ --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+ --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery
+ --direct_num_workers 2 \
+ --tif_metadata_for_datetime start_time
+```
+
+Upload only a subset of variables:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+ --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+ --variables u10 v10 t
+ --temp_location "gs://$BUCKET/tmp" \
+ --direct_num_workers 2
+```
+
+Upload all variables, but for a specific geographic region (for example, the
+continental US):
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+ --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+ --area 49 -124 24 -66 \
+ --temp_location "gs://$BUCKET/tmp" \
+ --direct_num_workers 2
+```
+
+Control how weather data is opened with XArray:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.grib" \
+ --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+ --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \
+ --temp_location "gs://$BUCKET/tmp" \
+ --direct_num_workers 2
+```
+
+Using DataflowRunner:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+ --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+ --runner DataflowRunner \
+ --project $PROJECT \
+ --region $REGION \
+ --temp_location "gs://$BUCKET/tmp" \
+ --job_name $JOB_NAME
+```
+
+For a full list of how to configure the Dataflow pipeline, please review
+[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
+
+## Regrid
+
+Using the subcomand alias 'rg':
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+ --output_path "gs://regrid-bucket/"
+
+```
+
+Preview regrid with a dry run:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+ --output_path "gs://regrid-bucket/" \
+ --dry-run
+
+```
+
+Interpolate to a finer grid resolution:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+ --output_path "gs://regrid-bucket/" \
+ --regrid_kwargs '{"grid": [0.1, 0.1]}'.
+
+```
+
+Interpolate to a high-resolution octahedral gaussian grid:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+ --output_path "gs://regrid-bucket/" \
+ --regrid_kwargs '{"grid": "O1280}'.
+
+```
+
+Convert gribs to NetCDF on copy:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+ --output_path "gs://regrid-bucket/" \
+ --to_netcdf
+```
+
+Using DataflowRunner:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.nc" \
+ --output_path "gs://regrid-bucket/" \
+ --runner DataflowRunner \
+ --project $PROJECT \
+ --region $REGION \
+ --temp_location "gs://$BUCKET/tmp" \
+ --experiment=use_runner_v2 \
+ --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" \
+ --job_name $JOB_NAME
+```
+
+Using DataflowRunner, with added disk per VM:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.nc" \
+ --output_path "gs://regrid-bucket/" \
+ --runner DataflowRunner \
+ --project $PROJECT \
+ --region $REGION \
+ --disk_size_gb 250 \
+ --temp_location "gs://$BUCKET/tmp" \
+ --experiment=use_runner_v2 \
+ --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" \
+ --job_name $JOB_NAME
+```
+
+## Earth Engine
+
+Using the subcommand alias `ee`:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
+ --ee_asset "projects/$PROJECT/assets/test_dir"
+```
+
+Preview ingestion with a dry run:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
+ --ee_asset "projects/$PROJECT/assets/test_dir" \
+ --dry-run
+```
+
+Authenticate earth engine using personal account:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
+ --ee_asset "projects/$PROJECT/assets/test_dir" \
+ --use_personal_account
+```
+
+Authenticate earth engine using a private key:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
+ --ee_asset "projects/$PROJECT/assets/test_dir" \
+ --service_account "my-service-account@...gserviceaccount.com" \
+ --private_key "path/to/private_key.json"
+```
+
+Ingest asset as table in earth engine:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
+ --ee_asset "projects/$PROJECT/assets/test_dir" \
+ --ee_asset_type "TABLE"
+```
+
+Restrict merging all bands or grib normalization:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
+ --ee_asset "projects/$PROJECT/assets/test_dir" \
+ --disable_grib_schema_normalization
+```
+
+Control how weather data is opened with XArray:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
+ --ee_asset "projects/$PROJECT/assets/test_dir"
+ --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \
+ --temp_location "gs://$BUCKET/tmp"
+```
+
+Limit EE requests:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
+ --ee_asset "projects/$PROJECT/assets/test_dir" \
+ --ee_qps 10 \
+ --ee_latency 0.5 \
+ --ee_max_concurrent 10
+```
+
+Custom Band names:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.tif" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
+ --ee_asset "projects/$PROJECT/assets/test_dir" \
+ --band_names_mapping "filename.json"
+```
+
+Getting initialization and forecast/end date-time from the filename:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.tif" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
+ --ee_asset "projects/$PROJECT/assets/test_dir" \
+ --initialization_time_regex "$REGEX" \
+ --forecast_time_regex "$REGEX"
+```
+
+Example:
+
+```bash
+weather-mv ee --uris "gs://tmp-gs-bucket/3B-HHR-E_MS_MRG_3IMERG_20220901-S000000-E002959_0000_V06C_30min.tiff" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
+ --ee_asset "projects/$PROJECT/assets/test_dir" \
+ --initialization_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S%H%M%S-*tiff" \
+ --forecast_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S*-E%H%M%S*tiff"
+```
+
+Using DataflowRunner:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+ --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
+ --ee_asset "projects/$PROJECT/assets/test_dir"
+ --runner DataflowRunner \
+ --project $PROJECT \
+ --region $REGION \
+ --temp_location "gs://$BUCKET/tmp" \
+ --job_name $JOB_NAME
+```
diff --git a/weather_mv/README.md b/weather_mv/README.md
index b2e3dd91..6efe97ca 100644
--- a/weather_mv/README.md
+++ b/weather_mv/README.md
@@ -1,27 +1,45 @@
-# ⛅️ `weather-mv` – Weather Mover
-
-Weather Mover loads weather data from cloud storage into analytics engines,
-like [Google BigQuery](https://cloud.google.com/bigquery) (_alpha_).
-
-## Features
-
-* **Rapid Querability**: After geospatial data is in BigQuery, data wranging becomes as simple as writing SQL. This
- allows for rapid data exploration, visualization, and model pipeline prototyping.
-* **Simple Versioning**: All rows in the table come with a `data_import_time` column. This provides some notion of how
- the data is versioned. Downstream analysis can adapt to data ingested at differen times by updating a `WHERE` clause.
-* **Parallel Upload**: Each file will be processed in parallel. With Dataflow autoscaling, even large datasets can be
- processed in a reasonable amount of time.
-* **Streaming support**: When running the mover in streaming mode, it will automatically process files as they appear in
- cloud buckets via PubSub.
-* _(new)_ **Grib Regridding**: `weather-mv regrid` uses [MetView](https://metview.readthedocs.io/en/latest/) to
+# ⛅️ `weather-mv` – Weather Mover _(alpha)_
+
+Weather Mover provides an easy and scalable way for:
+
+1. Loading weather dataset from Cloud Storage into [BigQuery](https://cloud.google.com/bigquery) and
+ [Google Earth Engine](https://earthengine.google.com/)
+2. Fast [interpolation & grid conversions](https://metview.readthedocs.io/en/latest/gen_files/icon_functions/regrid.html)
+ on large weather datasets
+
+It supports popular multi-dimensional data formats including NetCDF, GRIB, and GeoTIFF.
+
+## Why do we need this?
+
+* **Upload Speed**: Files in a weather dataset will be processed in parallel.
+ With [Dataflow autoscaling](https://cloud.google.com/dataflow/docs/horizontal-autoscaling),
+ even large datasets can be transferred to analytical engines in a
+ reasonable amount of time.
+* **Rapid Querying**: After your geospatial data is in BigQuery, data wrangling
+ becomes as simple as writing SQL. This
+ allows for rapid data exploration, visualization, and model pipeline
+ prototyping.
+* **Simple Versioning**: All rows in the destination BigQuery table come with
+ a `data_import_time`
+ column. This provides some notion of how
+ the data is versioned. Downstream analysis can adapt to data ingested at
+ different times by updating a `WHERE` clause.
+* **Streaming Support**: Weather Mover supports a streaming mode where it
+ automatically processes files as they appear in cloud buckets via PubSub.
+* **Scalable Regridding**: The regrid option
+ uses [MetView](https://metview.readthedocs.io/en/latest/) to
interpolate Grib files to a
[range of grids.](https://metview.readthedocs.io/en/latest/metview/using_metview/regrid_intro.html?highlight=grid#grid)
-* _(new)_ **Earth Engine Ingestion**: `weather-mv earthengine` ingests weather data into [Google Earth Engine](https://earthengine.google.com/).
+ To learn more about its use cases,
+ visit [MetView docs](https://metview.readthedocs.io/en/latest/metview/using_metview/regrid_intro.html#regrid-explained).
+* **Earth Engine**: By ingesting weather data into [Google Earth Engine](https://earthengine.google.com/),
+ research teams can use its powerful [APIs and App Development interface](https://developers.google.com/earth-engine)
+ for visualization and analysis of geospatial data.
## Usage
```
-usage: weather-mv [-h] {bigquery,bq,regrid,rg} ...
+usage: weather-mv [-h] {bigquery,bq,regrid,rg,earthengine,ee} ...
Weather Mover loads weather data from cloud storage into analytics engines.
@@ -36,21 +54,27 @@ optional arguments:
-h, --help show this help message and exit
```
-The weather mover makes use of subcommands to distinguish between tasks. The above tasks are currently supported.
+The three main subcommands are:
-_Common options_
+* `bigquery` or `bq`: Ingest data from Cloud Storage into BigQuery
+* `earthengine` or `ee`: Ingest data from Cloud Storage into Earth Engine
+* `regrid` or `rg`: Regrid data in Cloud Storage using MetView
-* `-i, --uris`: (required) URI glob pattern matching input weather data, e.g. 'gs://ecmwf/era5/era5-2015-*.gb'.
-* `--topic`: A Pub/Sub topic for GCS OBJECT_FINALIZE events, or equivalent, of a cloud bucket. E.g.
- 'projects//topics/'.
-* `--window_size`: Output file's window size in minutes. Only used with the `topic` flag. Default: 1.0 minute.
-* `--num_shards`: Number of shards to use when writing windowed elements to cloud storage. Only used with the `topic`
- flag. Default: 5 shards.
-* `-d, --dry-run`: Preview the load into BigQuery. Default: off.
+### General Flags
+
+* `-i, --uris`: (required) URI glob pattern matching input weather data, e.g. `
+ gs://ecmwf/era5/era5-2015-*.gb`.
+* `--topic`: A Pub/Sub topic for GCS `OBJECT_FINALIZE` [events](https://cloud.google.com/storage/docs/pubsub-notifications#events),
+ e.g. `projects//topics/`.
+* `--window_size`: Output file's window size in minutes, only used with
+ the `topic` flag. (default: _1 min_)
+* `--num_shards`: Number of shards to use when writing windowed elements to
+ Cloud Storage. Only used with the `topic` flag. (default: _5_)
+* `-d, --dry-run`: Preview the load into BigQuery. (default: _disabled_)
Invoke with `-h` or `--help` to see the full range of options.
-### `weather-mv bigquery`
+### BigQuery
```
usage: weather-mv bigquery [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE] [--num_shards NUM_SHARDS] [-d]
@@ -61,33 +85,48 @@ usage: weather-mv bigquery [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SI
[--coordinate_chunk_size COORDINATE_CHUNK_SIZE]
```
-The `bigquery` subcommand loads weather data into BigQuery. In addition to the common options above, users may specify
+The `bigquery` subcommand loads weather data into BigQuery. In addition to the
+common options above, users may specify
command-specific options:
_Command options_:
-* `-o, --output_table`: (required) Full name of destination BigQuery table. Ex: my_project.my_dataset.my_table
-* `-v, --variables`: Target variables (or coordinates) for the BigQuery schema. Default: will import all data variables
+* `-o, --output_table`: (required) Full name of destination BigQuery table. Ex:
+ my_project.my_dataset.my_table
+* `--temp_location`: (required) GCS directory used for staging temporary files
+ used in BigQuery table inserts
+* `-v, --variables`: Target variables (or coordinates) for the BigQuery schema.
+ Default: will import all data variables
as columns.
-* `-a, --area`: Target area in [N, W, S, E]. Default: Will include all available area.
-* `--import_time`: When writing data to BigQuery, record that data import occurred at this time
+* `-a, --area`: Target area in [N, W, S, E]. Default: Will include all
+ available area.
+* `--import_time`: When writing data to BigQuery, record that data import
+ occurred at this time
(format: YYYY-MM-DD HH:MM:SS.usec+offset). Default: now in UTC.
-* `--infer_schema`: Download one file in the URI pattern and infer a schema from that file. Default: off
-* `--xarray_open_dataset_kwargs`: Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string.
-* `--coordinate_chunk_size`: The size of the chunk of coordinates used for extracting vector data into BigQuery. Used to
+* `--infer_schema`: Download one file in the URI pattern and infer a schema from
+ that file. Default: off
+* `--xarray_open_dataset_kwargs`: Keyword-args to pass
+ into `xarray.open_dataset()` in the form of a JSON string.
+* `--coordinate_chunk_size`: The size of the chunk of coordinates used for
+ extracting vector data into BigQuery. Used to
tune parallel uploads.
-* `--tif_metadata_for_datetime` : Metadata that contains tif file's timestamp. Applicable only for tif files.
-* `-s, --skip-region-validation` : Skip validation of regions for data migration. Default: off.
-* `--disable_grib_schema_normalization` : To disable grib's schema normalization. Default: off.
+* `--tif_metadata_for_datetime` : Metadata that contains tif file's timestamp.
+ Applicable only for tif files.
+* `-s, --skip-region-validation` : Skip validation of regions for data
+ migration. Default: off.
+* `--disable_grib_schema_normalization` : To disable grib's schema
+ normalization. Default: off.
Invoke with `bq -h` or `bigquery --help` to see the full range of options.
-> Note: In case of grib files, by default its schema will be normalized and the name of the data variables will look
+> Note: In case of grib files, by default its schema will be normalized and the
+> name of the data variables will look
> like `___`.
->
-> This solves the issue of skipping over some of the data due to: https://github.com/ecmwf/cfgrib#filter-heterogeneous-grib-files.
+>
+> This solves the issue of skipping over some of the data due
+> to: https://github.com/ecmwf/cfgrib#filter-heterogeneous-grib-files.
-_Usage examples_:
+_Example:_
```bash
weather-mv bigquery --uris "gs://your-bucket/*.nc" \
@@ -96,81 +135,10 @@ weather-mv bigquery --uris "gs://your-bucket/*.nc" \
--direct_num_workers 2
```
-Using the subcommand alias `bq`:
+For an extensive set of examples, visit [here](Examples.md#bigquery).
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
- --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
- --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery
- --direct_num_workers 2
-```
-Preview load with a dry run:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
- --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
- --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery
- --direct_num_workers 2 \
- --dry-run
-```
-
-Load COG's (.tif) files:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.tif" \
- --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
- --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery
- --direct_num_workers 2 \
- --tif_metadata_for_datetime start_time
-```
-
-Upload only a subset of variables:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
- --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
- --variables u10 v10 t
- --temp_location "gs://$BUCKET/tmp" \
- --direct_num_workers 2
-```
-
-Upload all variables, but for a specific geographic region (for example, the continental US):
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
- --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
- --area 49 -124 24 -66 \
- --temp_location "gs://$BUCKET/tmp" \
- --direct_num_workers 2
-```
-
-Control how weather data is opened with XArray:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.grib" \
- --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
- --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \
- --temp_location "gs://$BUCKET/tmp" \
- --direct_num_workers 2
-```
-
-Using DataflowRunner:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
- --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
- --runner DataflowRunner \
- --project $PROJECT \
- --region $REGION \
- --temp_location "gs://$BUCKET/tmp" \
- --job_name $JOB_NAME
-```
-
-For a full list of how to configure the Dataflow pipeline, please review
-[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
-
-### `weather-mv regrid`
+### Regrid
```
usage: weather-mv regrid [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE] [--num_shards NUM_SHARDS] [-d]
@@ -179,25 +147,12 @@ usage: weather-mv regrid [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE
The `regrid` subcommand makes a regridded copy of the input data with MetView.
-To use this capability of the weather mover, please use the `[regrid]` extra when installing:
-
-```shell
-pip install google-weather-tools[regrid]
-```
-
-> **Warning**: MetView requires a decent amount of disk space in order to perform any regrid operation! Intermediary
-> regridding steps will write temporary grib data to disk. Thus, please make use of the `--disk_size_gb` Dataflow
-> option. A good rule of thumb would be to consume `30 + 2.5x` GBs of disk, where `x` is the size of each source data
-> file.
->
-> TODO([#191](https://github.com/google/weather-tools/issues/191)): Find smaller disk space bound.
-
-In addition to the common options above, users may specify command-specific options:
_Command options_:
* `-o, --output_path`: (required) The destination path for the regridded files.
-* `-k, --regrid_kwargs`: Keyword-args to pass into `metview.regrid()` in the form of a JSON string. Will default to
+* `-k, --regrid_kwargs`: Keyword-args to pass into `metview.regrid()` in the
+ form of a JSON string. Will default to
'{"grid": [0.25, 0.25]}'.
* `--to_netcdf`: Write output file in NetCDF via XArray. Default: off
@@ -206,11 +161,18 @@ consult [this documentation.](https://metview.readthedocs.io/en/latest/metview/u
Invoke with `rg -h` or `regrid --help` to see the full range of options.
-> Note: Currently, `regrid` doesn't work out-of-the-box! Until [#172](https://github.com/google/weather-tools/issues/172)
-> is fixed, users will have to use a workaround in order to ensure MetView is installed in their runner environment
-> (instructions are below).
+> **Warning**: MetView requires a decent amount of disk space in order to
+> perform any regrid operation! Intermediary
+> regridding steps will write temporary grib data to disk. Thus, please make use
+> of the `--disk_size_gb` Dataflow
+> option. A good rule of thumb would be to consume `30 + 2.5x` GBs of disk,
+> where `x` is the size of each source data
+> file.
+>
+> TODO([#191](https://github.com/google/weather-tools/issues/191)): Find smaller
+> disk space bound.
-_Usage examples_:
+_Example_:
```bash
weather-mv regrid --uris "gs://your-bucket/*.gb" \
@@ -218,82 +180,12 @@ weather-mv regrid --uris "gs://your-bucket/*.gb" \
```
-Using the subcomand alias 'rg':
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
- --output_path "gs://regrid-bucket/"
-
-```
-
-Preview regrid with a dry run:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
- --output_path "gs://regrid-bucket/" \
- --dry-run
-
-```
-
-Interpolate to a finer grid resolution:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
- --output_path "gs://regrid-bucket/" \
- --regrid_kwargs '{"grid": [0.1, 0.1]}'.
-
-```
-
-Interpolate to a high-resolution octahedral gaussian grid:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
- --output_path "gs://regrid-bucket/" \
- --regrid_kwargs '{"grid": "O1280}'.
-
-```
-
-Convert gribs to NetCDF on copy:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
- --output_path "gs://regrid-bucket/" \
- --to_netcdf
-```
-
-Using DataflowRunner:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.nc" \
- --output_path "gs://regrid-bucket/" \
- --runner DataflowRunner \
- --project $PROJECT \
- --region $REGION \
- --temp_location "gs://$BUCKET/tmp" \
- --experiment=use_runner_v2 \
- --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" \
- --job_name $JOB_NAME
-```
-
-Using DataflowRunner, with added disk per VM:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.nc" \
- --output_path "gs://regrid-bucket/" \
- --runner DataflowRunner \
- --project $PROJECT \
- --region $REGION \
- --disk_size_gb 250 \
- --temp_location "gs://$BUCKET/tmp" \
- --experiment=use_runner_v2 \
- --sdk_container_image="gcr.io/$PROJECT/$REPO:latest" \
- --job_name $JOB_NAME
-```
+For an extensive set of examples, visit [here](Examples.md#regrid).
For a full list of how to configure the Dataflow pipeline, please review
[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
-### `weather-mv earthengine`
+### Earth Engine
```
usage: weather-mv earthengine [-h] -i URIS --asset_location ASSET_LOCATION --ee_asset EE_ASSET
@@ -303,35 +195,52 @@ usage: weather-mv earthengine [-h] -i URIS --asset_location ASSET_LOCATION --ee_
[--ee_qps EE_QPS] [--ee_latency EE_LATENCY] [--ee_max_concurrent EE_MAX_CONCURRENT]
```
-The `earthengine` subcommand ingests weather data into Earth Engine. In addition to the common options above,
-users may specify command-specific options:
+The `earthengine` subcommand ingests weather data into Earth Engine. In addition
+to the general flags above, users may specify command-specific options:
_Command options_:
-* `--asset_location`: (required) Bucket location at which asset files will be pushed.
-* `--ee_asset`: (required) The asset folder path in earth engine project where the asset files will be pushed.
- It should be in format: `projects//assets/`. Make sure that is there
- under in earth engine assets. i.e. projects/my-gcp-project/assets/my/foo/bar.
-* `--ee_asset_type`: The type of asset to ingest in the earth engine. Default: IMAGE.
+* `--asset_location`: (required) Bucket location at which asset files will be
+ pushed.
+* `--ee_asset`: (required) The asset folder path in earth engine project where
+ the asset files will be pushed.
+ It should be in format: `projects//assets/`. Make
+ sure that is there
+ under in earth engine assets. i.e.
+ projects/my-gcp-project/assets/my/foo/bar.
+* `--ee_asset_type`: The type of asset to ingest in the earth engine. Default:
+ IMAGE.
Supported types are `IMAGE` and `TABLE`.\
`IMAGE`: Uploads georeferenced raster datasets in GeoTIFF format.\
- `TABLE`: Uploads the datsets in the CSV format. Useful in case of point data (sparse data).
-* `--disable_grib_schema_normalization`: Restricts merging of grib datasets. Default: False
-* `-u, --use_personal_account`: To use personal account for earth engine authentication.
-* `--service_account`: Service account address when using a private key for earth engine authentication.
-* `--private_key`: To use a private key for earth engine authentication. Only used with the `service_account` flag.
-* `--xarray_open_dataset_kwargs`: Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string.
-* `-s, --skip-region-validation` : Skip validation of regions for data migration. Default: off.
-* `--ee_qps`: Maximum queries per second allowed by EE for your project. Default: 10.
+ `TABLE`: Uploads the datsets in the CSV format. Useful in case of point data (
+ sparse data).
+* `--disable_grib_schema_normalization`: Restricts merging of grib datasets.
+ Default: False
+* `-u, --use_personal_account`: To use personal account for earth engine
+ authentication.
+* `--service_account`: Service account address when using a private key for
+ earth engine authentication.
+* `--private_key`: To use a private key for earth engine authentication. Only
+ used with the `service_account` flag.
+* `--xarray_open_dataset_kwargs`: Keyword-args to pass
+ into `xarray.open_dataset()` in the form of a JSON string.
+* `-s, --skip-region-validation` : Skip validation of regions for data
+ migration. Default: off.
+* `--ee_qps`: Maximum queries per second allowed by EE for your project.
+ Default: 10.
* `--ee_latency`: The expected latency per requests, in seconds. Default: 0.5.
-* `--ee_max_concurrent`: Maximum concurrent api requests to EE allowed for your project. Default: 10.
-* `--band_names_mapping`: A JSON file which contains the band names for the TIFF file.
-* `--initialization_time_regex`: A Regex string to get the initialization time from the filename.
-* `--forecast_time_regex`: A Regex string to get the forecast/end time from the filename.
+* `--ee_max_concurrent`: Maximum concurrent api requests to EE allowed for your
+ project. Default: 10.
+* `--band_names_mapping`: A JSON file which contains the band names for the TIFF
+ file.
+* `--initialization_time_regex`: A Regex string to get the initialization time
+ from the filename.
+* `--forecast_time_regex`: A Regex string to get the forecast/end time from the
+ filename.
Invoke with `ee -h` or `earthengine --help` to see the full range of options.
-_Usage examples_:
+_Example_:
```bash
weather-mv earthengine --uris "gs://your-bucket/*.grib" \
@@ -339,140 +248,56 @@ weather-mv earthengine --uris "gs://your-bucket/*.grib" \
--ee_asset "projects/$PROJECT/assets/test_dir"
```
-Using the subcommand alias `ee`:
+For an extensive set of examples, visit [here](Examples.md#earth-engine).
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
- --ee_asset "projects/$PROJECT/assets/test_dir"
-```
+## Configuring Dataflow
-Preview ingestion with a dry run:
+One of the most powerful features of Apache Beam is supporting a wide range of
+execution backends, called [_runners_](https://beam.apache.org/documentation/runners/direct/).
+The default [_Direct Runner_](https://beam.apache.org/documentation/runners/direct/)
+ is a multi-threaded single-node backend which executes the pipeline in the local
+machine. In contrast, the [_Dataflow Runner_](https://beam.apache.org/documentation/runners/dataflow/)
+deploys the pipeline on [Cloud Dataflow](https://cloud.google.com/dataflow),
+a fully-managed Google Cloud service which is highly optimized for Apache Beam
+pipelines.
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
- --ee_asset "projects/$PROJECT/assets/test_dir" \
- --dry-run
-```
-
-Authenticate earth engine using personal account:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
- --ee_asset "projects/$PROJECT/assets/test_dir" \
- --use_personal_account
-```
-
-Authenticate earth engine using a private key:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
- --ee_asset "projects/$PROJECT/assets/test_dir" \
- --service_account "my-service-account@...gserviceaccount.com" \
- --private_key "path/to/private_key.json"
-```
-
-Ingest asset as table in earth engine:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
- --ee_asset "projects/$PROJECT/assets/test_dir" \
- --ee_asset_type "TABLE"
-```
-
-Restrict merging all bands or grib normalization:
+Using Dataflow has many benefits including convenient high-performance
+integration with Cloud Storage and BigQuery as well as easy scaling.
+The runner used for Weather Tools pipelines can be configured using
+the `--runner` option (e.g. `--runner DataflowRunner`).
+Additional configurations may also be passed to customize the Dataflow
+pipeline including resources and autoscaling settings.
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
- --ee_asset "projects/$PROJECT/assets/test_dir" \
- --disable_grib_schema_normalization
-```
-
-Control how weather data is opened with XArray:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
- --ee_asset "projects/$PROJECT/assets/test_dir"
- --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \
- --temp_location "gs://$BUCKET/tmp"
-```
-
-Limit EE requests:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
- --ee_asset "projects/$PROJECT/assets/test_dir" \
- --ee_qps 10 \
- --ee_latency 0.5 \
- --ee_max_concurrent 10
-```
-
-Custom Band names:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.tif" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
- --ee_asset "projects/$PROJECT/assets/test_dir" \
- --band_names_mapping "filename.json"
-```
-
-Getting initialization and forecast/end date-time from the filename:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.tif" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
- --ee_asset "projects/$PROJECT/assets/test_dir" \
- --initialization_time_regex "$REGEX" \
- --forecast_time_regex "$REGEX"
-```
-
-Example:
-
-```bash
-weather-mv ee --uris "gs://tmp-gs-bucket/3B-HHR-E_MS_MRG_3IMERG_20220901-S000000-E002959_0000_V06C_30min.tiff" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
- --ee_asset "projects/$PROJECT/assets/test_dir" \
- --initialization_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S%H%M%S-*tiff" \
- --forecast_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S*-E%H%M%S*tiff"
-```
-
-Using DataflowRunner:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
- --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.grib
- --ee_asset "projects/$PROJECT/assets/test_dir"
- --runner DataflowRunner \
- --project $PROJECT \
- --region $REGION \
- --temp_location "gs://$BUCKET/tmp" \
- --job_name $JOB_NAME
-```
-
-For a full list of how to configure the Dataflow pipeline, please review
+For a full reference of how to configure the Dataflow pipeline, please review
[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
## Streaming ingestion
`weather-mv` optionally provides the ability to react
-to [Pub/Sub events for objects added to GCS](https://cloud.google.com/storage/docs/pubsub-notifications). This can be
-used to automate ingestion into BigQuery as soon as weather data is disseminated. Another common use case it to
-automatically create a down-sampled version of a dataset with `regrid`. To set up the Weather Mover with streaming
-ingestion, use the `--topic` flag (see "Common options" above).
-
-Objects that don't match the `--uris` glob pattern will be filtered out of ingestion. This way, a bucket can contain
+to [Pub/Sub events for objects added to GCS](https://cloud.google.com/storage/docs/pubsub-notifications).
+This can be used to automate ingestion into BigQuery as soon as weather data is
+disseminated. Another common use case it to automatically create a down-sampled
+version of a dataset with `regrid`. To set up the Weather Mover with streaming
+ingestion, use the `--topic` flag (see [General Flags](#general-flags)).
+
+Objects that don't match the `--uris` glob pattern will be filtered out of
+ingestion. This way, a bucket can contain
multiple types of data yet only have subsets processed with `weather-mv`.
-> It's worth noting: when setting up PubSub, **make sure to create a topic for GCS `OBJECT_FINALIZE` events only.**
+> **NOTE:** When setting up PubSub, make sure to create a topic for
+**GCS `OBJECT_FINALIZE` events only**.
+
+> **NOTE:** Data is written into BigQuery using streaming inserts. It may
+> take [up to 90 minutes](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability)
+> for buffers to persist into storage. However, weather data will be available for
+> querying immediately.
-_Usage examples_:
+> **NOTE:** It's recommended that you specify variables to
+> ingest (`-v, --variables`) instead of inferring the schema for
+> streaming pipelines. Not all variables will be distributed with every file,
+> especially when they are in Grib format.
+
+_Examples_:
```shell
weather-mv bq --uris "gs://your-bucket/*.nc" \
@@ -510,24 +335,15 @@ weather-mv bq --uris "gs://your-bucket/*.nc" \
--job_name $JOB_NAME
```
-### BigQuery
-
-Data is written into BigQuery using streaming inserts. It may
-take [up to 90 minutes](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability)
-for buffers to persist into storage. However, weather data will be available for querying immediately.
-
-> Note: It's recommended that you specify variables to ingest (`-v, --variables`) instead of inferring the schema for
-> streaming pipelines. Not all variables will be distributed with every file, especially when they are in Grib format.
-
-## Private Network Configuration
+## Known Issues
-While running `weather-mv` pipeline in GCP, there is a possibility that you may receive following error -
-"Quotas were exceeded: IN_USE_ADDRESSES"
+### "Quotas were exceeded: IN_USE_ADDRESSES"
-This error occurs when GCP is trying to add new worker-instances and finds that, “Public IP” quota (assigned to your
-project) is exhausted.
+This error occurs when GCP is trying to add new worker-instances and finds that,
+“Public IP” quota (assigned to your project) is exhausted.
-To solve this, we recommend using private IP while running your dataflow pipelines.
+To solve this, we recommend using private IP while running your dataflow
+pipelines.
```shell
weather-mv bq --uris "gs://your-bucket/*.nc" \
@@ -541,53 +357,63 @@ weather-mv bq --uris "gs://your-bucket/*.nc" \
--subnetwork=regions/$REGION/subnetworks/$SUBNETWORK
```
-_Common options_:
+_Network configuration options_:
-* `--no_use_public_ips`: To make Dataflow workers use private IP addresses for all communication, specify the
- command-line flag: --no_use_public_ips. Make sure that the specified network or subnetwork has Private Google Access
+* `--no_use_public_ips`: To make Dataflow workers use private IP addresses for
+ all communication, specify the
+ command-line flag: --no_use_public_ips. Make sure that the specified network
+ or subnetwork has Private Google Access
enabled.
-* `--network`: The Compute Engine network for launching Compute Engine instances to run your pipeline.
-* `--subnetwork`: The Compute Engine subnetwork for launching Compute Engine instances to run your pipeline.
-
-For more information regarding how to configure Private IP, please refer
-to [Private IP Configuration Guide for Dataflow Pipeline Execution](../Private-IP-Configuration.md)
-.
-
-For more information regarding Pipeline options, please refer
-to [pipeline-options](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
-
-## Custom Dataflow Container for ECMWF dependencies (like MetView)
-
-It's difficult to install all necessary system dependencies on a Dataflow worker with a pure python solution. For
-example, MetView requires binaries to be installed on the system machine, which are broken in the standard debian
-install channels (they are only maintained via `conda-forge`).
-
-Thus, to include such dependencies, we've provided steps for you to build
-a [Beam container environment](https://beam.apache.org/documentation/runtime/environments/). In the near future, we'll
-arrange things so you don't have to worry about any of these extra
-steps ([#172](https://github.com/google/weather-tools/issues/172)). See [these instructions](../Runtime-Container.md)
-to learn how to build a custom image for this project.
-
-Currently, this image is necessary for the `weather-mv regrid` command, but no other commands. To deploy this tool,
-please do the following:
-
-1. Host a container image of the included Dockerfile in your repository of choice (instructions for building images in
- GCS are in the next section).
-2. Add the following two flags to your regrid pipeline.
- ```
- --experiment=use_runner_v2 \
- --sdk_container_image=$CONTAINER_URL
- ```
- For example, the full Dataflow command, assuming you follow the next section's instructions, should look like:
-
- ```bash
- weather-mv rg --uris "gs://your-bucket/*.nc" \
- --output_path "gs://regrid-bucket/" \
- --runner DataflowRunner \
- --project $PROJECT \
- --region $REGION \
- --temp_location "gs://$BUCKET/tmp" \
- --experiment=use_runner_v2 \
- --sdk_container_image="gcr.io/$PROJECT/$REPO:latest"
- --job_name $JOB_NAME
- ```
+* `--network`: The Compute Engine network for launching Compute Engine instances
+ to run your pipeline.
+* `--subnetwork`: The Compute Engine subnetwork for launching Compute Engine
+ instances to run your pipeline.
+
+To learn more, visit [Private IP Configuration Guide for Dataflow Pipeline Execution](../Private-IP-Configuration.md).
+
+For more information regarding Dataflow pipeline options, see the [Configuring Dataflow](#configuring-dataflow)
+section above.
+
+## Dataflow Container Image
+
+Weather Mover uses 3rd-party binary dependencies including MetView which can
+not be reliably set up using pure python solutions. To overcome this, we use
+the [Conda-forge](https://conda-forge.org/) distribution of these binaries.
+While our [Conda environment](https://github.com/google/weather-tools/blob/main/environment.yml)
+is sufficient for local runs, in remote executions like Dataflow, a Docker
+container image with the dependencies pre-installed is required.
+
+By default, our public Docker image
+`gcr.io/weather-tools-prod/weather-tools:x.y.z` is used when the pipeline is
+executed using _DataflowRunner_. The image is built from [this Dockerfile](https://github.com/google/weather-tools/blob/main/Dockerfile)
+and produces the same Conda environment.
+
+For development purposes, you may use your own customized Docker image. See
+[these instructions](../Runtime-Container.md) to learn how to build a custom
+image for this project using [Cloud Build](https://cloud.google.com/build).
+After building and pushing the custom container image to a container registry of
+your choice (like [Cloud Container Registry](https://cloud.google.com/container-registry)),
+you can then specify the image used by Dataflow workers for running your
+pipeline using these flags:
+
+ ```
+ --experiment=use_runner_v2 \
+ --sdk_container_image=$CONTAINER_URL
+ ```
+
+_Example:_
+
+ ```bash
+ weather-mv rg --uris "gs://your-bucket/*.nc" \
+ --output_path "gs://regrid-bucket/" \
+ --runner DataflowRunner \
+ --project $PROJECT \
+ --region $REGION \
+ --temp_location "gs://$BUCKET/tmp" \
+ --experiment=use_runner_v2 \
+ --sdk_container_image="gcr.io/$PROJECT/$REPO:latest"
+ --job_name $JOB_NAME
+ ```
+
+> **NOTE:** Currently, this image is only necessary for the `weather-mv regrid`
+> command as the other commands use pure python dependencies.
diff --git a/weather_mv/loader_pipeline/bq.py b/weather_mv/loader_pipeline/bq.py
index d1833df1..0ade6b20 100644
--- a/weather_mv/loader_pipeline/bq.py
+++ b/weather_mv/loader_pipeline/bq.py
@@ -101,6 +101,8 @@ def add_parser_arguments(cls, subparser: argparse.ArgumentParser):
subparser.add_argument('-o', '--output_table', type=str, required=True,
help="Full name of destination BigQuery table (..). Table "
"will be created if it doesn't exist.")
+ subparser.add_argument('--temp_location', type=str, required=True,
+ help="GCS directory used for staging temporary files used in BigQuery table inserts")
subparser.add_argument('-v', '--variables', metavar='variables', type=str, nargs='+', default=list(),
help='Target variables (or coordinates) for the BigQuery schema. Default: will import '
'all data variables as columns.')
diff --git a/weather_mv/loader_pipeline/pipeline.py b/weather_mv/loader_pipeline/pipeline.py
index 856ee45d..15b1b399 100644
--- a/weather_mv/loader_pipeline/pipeline.py
+++ b/weather_mv/loader_pipeline/pipeline.py
@@ -28,6 +28,8 @@
logger = logging.getLogger(__name__)
+DEFAULT_CONTAINER_IMAGE = "gcr.io/weather-tools-prod/weather-tools:0.0.0"
+
def configure_logger(verbosity: int) -> None:
"""Configures logging from verbosity. Default verbosity will show errors."""
@@ -45,6 +47,21 @@ def pattern_to_uris(match_pattern: str, is_zarr: bool = False) -> t.Iterable[str
yield from [x.path for x in match.metadata_list]
+def _arg_list_includes(args: t.List[str], key: str, val: t.Optional[str] = None) -> bool:
+ """check if a "--key [val]" or "--key=val" argument exists in args list"""
+ # single valued check (looking for "--key")
+ if val is None:
+ for arg in args:
+ if arg == f"--{key}" or arg.split("=")[0] == f"--{key}":
+ return True
+ return False
+ # double valued check (looking for "--key=value", "--key value")
+ for idx, arg in enumerate(args[:-1]):
+ if (arg == f"--{key}" and args[idx + 1] == val) or arg == f"--{key}={val}":
+ return True
+ return False
+
+
def pipeline(known_args: argparse.Namespace, pipeline_args: t.List[str]) -> None:
all_uris = list(pattern_to_uris(known_args.uris, known_args.zarr))
if not all_uris:
@@ -127,8 +144,19 @@ def run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]:
help='Move data into Google EarthEngine')
ToEarthEngine.add_parser_arguments(ee_parser)
+ # print top-level help if zero arguments are passed
+ if len(argv) == 1:
+ parser.print_help()
+ exit(0)
+
known_args, pipeline_args = parser.parse_known_args(argv[1:])
+ # temporary location is shown as a mandatory argument in help messages
+ # but also needs to be available in pipeline args
+ if hasattr(known_args, 'temp_location') and known_args.temp_location:
+ logger.debug("Using temporary location %s", known_args.temp_location)
+ pipeline_args.extend(['--temp_location', known_args.temp_location])
+
configure_logger(2) # 0 = error, 1 = warn, 2 = info, 3 = debug
# Validate Zarr arguments
@@ -160,4 +188,10 @@ def run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]:
# workflow rely on global context (e.g., a module imported at module level).
pipeline_args.extend('--save_main_session true'.split())
+ # add default image for Dataflow runner
+ if _arg_list_includes(pipeline_args, 'runner', 'DataflowRunner') \
+ and not _arg_list_includes(pipeline_args, 'sdk_container_image'):
+ logger.info('Using default Dataflow container image "%s"', DEFAULT_CONTAINER_IMAGE)
+ pipeline_args.extend(['--sdk_container_image', DEFAULT_CONTAINER_IMAGE])
+
return known_args, pipeline_args