From ee0426e6541c48a14e6cf70af0122fff4c80b06e Mon Sep 17 00:00:00 2001
From: Iman Akbari <imakbari@gmail.com>
Date: Thu, 23 Feb 2023 02:04:12 -0500
Subject: [PATCH 1/3] Revamp weather-mv docs for Docker image release

---
 weather_mv/Examples.md | 272 +++++++++++++++++
 weather_mv/README.md   | 650 +++++++++++++++--------------------------
 2 files changed, 510 insertions(+), 412 deletions(-)
 create mode 100644 weather_mv/Examples.md

diff --git a/weather_mv/Examples.md b/weather_mv/Examples.md
new file mode 100644
index 00000000..29f29573
--- /dev/null
+++ b/weather_mv/Examples.md
@@ -0,0 +1,272 @@
+# Weather Mover Examples
+
+## BigQuery
+
+
+Using the subcommand alias `bq`:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+           --temp_location "gs://$BUCKET/tmp" \  # Needed for batch writes to BigQuery
+           --direct_num_workers 2
+```
+
+Preview load with a dry run:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+           --temp_location "gs://$BUCKET/tmp" \  # Needed for batch writes to BigQuery
+           --direct_num_workers 2 \
+           --dry-run
+```
+
+Load COG's (.tif) files:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.tif" \
+           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+           --temp_location "gs://$BUCKET/tmp" \  # Needed for batch writes to BigQuery
+           --direct_num_workers 2 \
+           --tif_metadata_for_datetime start_time
+```
+
+Upload only a subset of variables:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+           --variables u10 v10 t
+           --temp_location "gs://$BUCKET/tmp" \
+           --direct_num_workers 2
+```
+
+Upload all variables, but for a specific geographic region (for example, the
+continental US):
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+           --area 49 -124 24 -66 \
+           --temp_location "gs://$BUCKET/tmp" \
+           --direct_num_workers 2
+```
+
+Control how weather data is opened with XArray:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.grib" \
+           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+           --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \
+           --temp_location "gs://$BUCKET/tmp" \
+           --direct_num_workers 2
+```
+
+Using DataflowRunner:
+
+```bash
+weather-mv bq --uris "gs://your-bucket/*.nc" \
+           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
+           --runner DataflowRunner \
+           --project $PROJECT \
+           --region  $REGION \
+           --temp_location "gs://$BUCKET/tmp" \
+           --job_name $JOB_NAME 
+```
+
+For a full list of how to configure the Dataflow pipeline, please review
+[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
+
+## Regrid
+
+Using the subcomand alias 'rg':
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+           --output_path "gs://regrid-bucket/" 
+ 
+```
+
+Preview regrid with a dry run:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+           --output_path "gs://regrid-bucket/" \
+           --dry-run
+ 
+```
+
+Interpolate to a finer grid resolution:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+           --output_path "gs://regrid-bucket/" \
+           --regrid_kwargs '{"grid": [0.1, 0.1]}'.
+ 
+```
+
+Interpolate to a high-resolution octahedral gaussian grid:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+           --output_path "gs://regrid-bucket/" \
+           --regrid_kwargs '{"grid": "O1280}'.
+ 
+```
+
+Convert gribs to NetCDF on copy:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.gb" \
+           --output_path "gs://regrid-bucket/" \
+           --to_netcdf
+```
+
+Using DataflowRunner:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.nc" \
+           --output_path "gs://regrid-bucket/" \
+           --runner DataflowRunner \
+           --project $PROJECT \
+           --region  $REGION \
+           --temp_location "gs://$BUCKET/tmp" \
+           --experiment=use_runner_v2 \
+           --sdk_container_image="gcr.io/$PROJECT/$REPO:latest"  \
+           --job_name $JOB_NAME 
+```
+
+Using DataflowRunner, with added disk per VM:
+
+```bash
+weather-mv rg --uris "gs://your-bucket/*.nc" \
+           --output_path "gs://regrid-bucket/" \
+           --runner DataflowRunner \
+           --project $PROJECT \
+           --region  $REGION \
+           --disk_size_gb 250 \ 
+           --temp_location "gs://$BUCKET/tmp" \
+           --experiment=use_runner_v2 \
+           --sdk_container_image="gcr.io/$PROJECT/$REPO:latest"  \
+           --job_name $JOB_NAME 
+```
+
+## Earth Engine
+
+Using the subcommand alias `ee`:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
+           --ee_asset "projects/$PROJECT/assets/test_dir"
+```
+
+Preview ingestion with a dry run:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
+           --ee_asset "projects/$PROJECT/assets/test_dir" \
+           --dry-run
+```
+
+Authenticate earth engine using personal account:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
+           --ee_asset "projects/$PROJECT/assets/test_dir" \
+           --use_personal_account
+```
+
+Authenticate earth engine using a private key:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
+           --ee_asset "projects/$PROJECT/assets/test_dir" \
+           --service_account "my-service-account@...gserviceaccount.com" \
+           --private_key "path/to/private_key.json"
+```
+
+Ingest asset as table in earth engine:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
+           --ee_asset "projects/$PROJECT/assets/test_dir" \
+           --ee_asset_type "TABLE"
+```
+
+Restrict merging all bands or grib normalization:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
+           --ee_asset "projects/$PROJECT/assets/test_dir" \
+           --disable_grib_schema_normalization
+```
+
+Control how weather data is opened with XArray:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
+           --ee_asset "projects/$PROJECT/assets/test_dir"
+           --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \
+           --temp_location "gs://$BUCKET/tmp"
+```
+
+Limit EE requests:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
+           --ee_asset "projects/$PROJECT/assets/test_dir" \
+           --ee_qps 10 \
+           --ee_latency 0.5 \
+           --ee_max_concurrent 10
+```
+
+Custom Band names:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.tif" \
+           --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
+           --ee_asset "projects/$PROJECT/assets/test_dir" \
+           --band_names_mapping "filename.json"
+```
+
+Getting initialization and forecast/end date-time from the filename:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.tif" \
+           --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
+           --ee_asset "projects/$PROJECT/assets/test_dir" \
+           --initialization_time_regex "$REGEX" \
+           --forecast_time_regex "$REGEX"
+```
+
+Example:
+
+```bash
+weather-mv ee --uris "gs://tmp-gs-bucket/3B-HHR-E_MS_MRG_3IMERG_20220901-S000000-E002959_0000_V06C_30min.tiff" \
+           --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
+           --ee_asset "projects/$PROJECT/assets/test_dir" \
+           --initialization_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S%H%M%S-*tiff" \
+           --forecast_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S*-E%H%M%S*tiff"
+```
+
+Using DataflowRunner:
+
+```bash
+weather-mv ee --uris "gs://your-bucket/*.grib" \
+           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
+           --ee_asset "projects/$PROJECT/assets/test_dir"
+           --runner DataflowRunner \
+           --project $PROJECT \
+           --region  $REGION \
+           --temp_location "gs://$BUCKET/tmp" \
+           --job_name $JOB_NAME
+```
diff --git a/weather_mv/README.md b/weather_mv/README.md
index b2e3dd91..7d60677f 100644
--- a/weather_mv/README.md
+++ b/weather_mv/README.md
@@ -1,27 +1,47 @@
-# ⛅️ `weather-mv` – Weather Mover
-
-Weather Mover loads weather data from cloud storage into analytics engines,
-like [Google BigQuery](https://cloud.google.com/bigquery) (_alpha_).
-
-## Features
-
-* **Rapid Querability**: After geospatial data is in BigQuery, data wranging becomes as simple as writing SQL. This
-  allows for rapid data exploration, visualization, and model pipeline prototyping.
-* **Simple Versioning**: All rows in the table come with a `data_import_time` column. This provides some notion of how
-  the data is versioned. Downstream analysis can adapt to data ingested at differen times by updating a `WHERE` clause.
-* **Parallel Upload**: Each file will be processed in parallel. With Dataflow autoscaling, even large datasets can be
-  processed in a reasonable amount of time.
-* **Streaming support**: When running the mover in streaming mode, it will automatically process files as they appear in
-  cloud buckets via PubSub.
-* _(new)_ **Grib Regridding**: `weather-mv regrid` uses [MetView](https://metview.readthedocs.io/en/latest/) to
+# ⛅️ `weather-mv` – Weather Mover <sub><sup>_(alpha)_</sup></sub>
+
+Weather Mover provides an easy and scalable way for:
+
+1. Loading weather datasets 
+   from Cloud Storage into [BigQuery](https://cloud.google.com/bigquery) and
+   [Google Earth Engine](https://earthengine.google.com/)
+2. Fast [interpolation & grid conversions](https://metview.readthedocs.io/en/latest/gen_files/icon_functions/regrid.html)
+on large weather datasets
+
+It supports popular multi-dimensional data formats including NetCDF, GRIB, 
+GeoTIFF, and all other formats [supported by Xarray](https://docs.xarray.dev/en/stable/user-guide/io.html).
+
+## Why do we need this?
+
+* **Upload Speed**: Files in a weather dataset will be processed in parallel.
+  With [Dataflow autoscaling](https://cloud.google.com/dataflow/docs/horizontal-autoscaling), 
+  even large datasets can be transferred to analytical engines in a
+  reasonable amount of time.
+* **Rapid Querying**: After your geospatial data is in BigQuery, data wrangling
+  becomes as simple as writing SQL. This
+  allows for rapid data exploration, visualization, and model pipeline
+  prototyping.
+* **Simple Versioning**: All rows in the destination BigQuery table come with
+  a `data_import_time`
+  column. This provides some notion of how
+  the data is versioned. Downstream analysis can adapt to data ingested at
+  different times by updating a `WHERE` clause.
+* **Streaming Support**: Weather Mover supports a streaming mode where it
+  automatically processes files as they appear in cloud buckets via PubSub.
+* **Scalable Regridding**: The regrid option
+  uses [MetView](https://metview.readthedocs.io/en/latest/) to
   interpolate Grib files to a
   [range of grids.](https://metview.readthedocs.io/en/latest/metview/using_metview/regrid_intro.html?highlight=grid#grid)
-* _(new)_ **Earth Engine Ingestion**: `weather-mv earthengine` ingests weather data into [Google Earth Engine](https://earthengine.google.com/).
+  To learn more about its use cases,
+  visit [MetView docs](https://metview.readthedocs.io/en/latest/metview/using_metview/regrid_intro.html#regrid-explained).
+* **Earth Engine**: By ingesting weather data into [Google Earth Engine](https://earthengine.google.com/),
+  research teams can use its powerful [APIs and App Development interface](https://developers.google.com/earth-engine)
+  for visualization and analysis of geospatial data.
 
 ## Usage
 
 ```
-usage: weather-mv [-h] {bigquery,bq,regrid,rg} ...
+usage: weather-mv [-h] {bigquery,bq,regrid,rg,earthengine,ee} ...
 
 Weather Mover loads weather data from cloud storage into analytics engines.
 
@@ -36,21 +56,27 @@ optional arguments:
   -h, --help            show this help message and exit
 ```
 
-The weather mover makes use of subcommands to distinguish between tasks. The above tasks are currently supported.
+The three main subcommands are:
 
-_Common options_
+* `bigquery` or `bq`: Ingest data from Cloud Storage into BigQuery
+* `earthengine` or `ee`: Ingest data from Cloud Storage into Earth Engine
+* `regrid` or `rg`: Regrid data in Cloud Storage using MetView
 
-* `-i, --uris`: (required) URI glob pattern matching input weather data, e.g. 'gs://ecmwf/era5/era5-2015-*.gb'.
-* `--topic`: A Pub/Sub topic for GCS OBJECT_FINALIZE events, or equivalent, of a cloud bucket. E.g.
-  'projects/<PROJECT_ID>/topics/<TOPIC_ID>'.
-* `--window_size`: Output file's window size in minutes. Only used with the `topic` flag. Default: 1.0 minute.
-* `--num_shards`: Number of shards to use when writing windowed elements to cloud storage. Only used with the `topic`
-  flag. Default: 5 shards.
-* `-d, --dry-run`: Preview the load into BigQuery. Default: off.
+### General Flags
+
+* `-i, --uris`: (required) URI glob pattern matching input weather data, e.g. `
+  gs://ecmwf/era5/era5-2015-*.gb`.
+* `--topic`: A Pub/Sub topic for GCS `OBJECT_FINALIZE` [events](https://cloud.google.com/storage/docs/pubsub-notifications#events), 
+   e.g. `projects/<PROJECT_ID>/topics/<TOPIC_ID>`.
+* `--window_size`: Output file's window size in minutes, only used with
+  the `topic` flag. (default: _1 min_)
+* `--num_shards`: Number of shards to use when writing windowed elements to
+  Cloud Storage. Only used with the `topic` flag. (default: _5_)
+* `-d, --dry-run`: Preview the load into BigQuery. (default: _disabled_)
 
 Invoke with `-h` or `--help` to see the full range of options.
 
-### `weather-mv bigquery`
+### BigQuery
 
 ```
 usage: weather-mv bigquery [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE] [--num_shards NUM_SHARDS] [-d]
@@ -61,33 +87,46 @@ usage: weather-mv bigquery [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SI
                            [--coordinate_chunk_size COORDINATE_CHUNK_SIZE]
 ```
 
-The `bigquery` subcommand loads weather data into BigQuery. In addition to the common options above, users may specify
+The `bigquery` subcommand loads weather data into BigQuery. In addition to the
+common options above, users may specify
 command-specific options:
 
 _Command options_:
 
-* `-o, --output_table`: (required) Full name of destination BigQuery table. Ex: my_project.my_dataset.my_table
-* `-v, --variables`:  Target variables (or coordinates) for the BigQuery schema. Default: will import all data variables
+* `-o, --output_table`: (required) Full name of destination BigQuery table. Ex:
+  my_project.my_dataset.my_table
+* `-v, --variables`:  Target variables (or coordinates) for the BigQuery schema.
+  Default: will import all data variables
   as columns.
-* `-a, --area`:  Target area in [N, W, S, E]. Default: Will include all available area.
-* `--import_time`: When writing data to BigQuery, record that data import occurred at this time
+* `-a, --area`:  Target area in [N, W, S, E]. Default: Will include all
+  available area.
+* `--import_time`: When writing data to BigQuery, record that data import
+  occurred at this time
   (format: YYYY-MM-DD HH:MM:SS.usec+offset). Default: now in UTC.
-* `--infer_schema`: Download one file in the URI pattern and infer a schema from that file. Default: off
-* `--xarray_open_dataset_kwargs`: Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string.
-* `--coordinate_chunk_size`: The size of the chunk of coordinates used for extracting vector data into BigQuery. Used to
+* `--infer_schema`: Download one file in the URI pattern and infer a schema from
+  that file. Default: off
+* `--xarray_open_dataset_kwargs`: Keyword-args to pass
+  into `xarray.open_dataset()` in the form of a JSON string.
+* `--coordinate_chunk_size`: The size of the chunk of coordinates used for
+  extracting vector data into BigQuery. Used to
   tune parallel uploads.
-* `--tif_metadata_for_datetime` : Metadata that contains tif file's timestamp. Applicable only for tif files.
-* `-s, --skip-region-validation` : Skip validation of regions for data migration. Default: off.
-* `--disable_grib_schema_normalization` : To disable grib's schema normalization. Default: off.
+* `--tif_metadata_for_datetime` : Metadata that contains tif file's timestamp.
+  Applicable only for tif files.
+* `-s, --skip-region-validation` : Skip validation of regions for data
+  migration. Default: off.
+* `--disable_grib_schema_normalization` : To disable grib's schema
+  normalization. Default: off.
 
 Invoke with `bq -h` or `bigquery --help` to see the full range of options.
 
-> Note: In case of grib files, by default its schema will be normalized and the name of the data variables will look 
+> Note: In case of grib files, by default its schema will be normalized and the
+> name of the data variables will look
 > like `<level>_<height>_<attrs['GRIB_stepType']>_<key>`.
-> 
-> This solves the issue of skipping over some of the data due to: https://github.com/ecmwf/cfgrib#filter-heterogeneous-grib-files.
+>
+> This solves the issue of skipping over some of the data due
+> to: https://github.com/ecmwf/cfgrib#filter-heterogeneous-grib-files.
 
-_Usage examples_:
+_Example:_
 
 ```bash
 weather-mv bigquery --uris "gs://your-bucket/*.nc" \
@@ -96,81 +135,10 @@ weather-mv bigquery --uris "gs://your-bucket/*.nc" \
            --direct_num_workers 2
 ```
 
-Using the subcommand alias `bq`:
+For an extensive set of examples, visit [here](Examples.md#bigquery).
 
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
-           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
-           --temp_location "gs://$BUCKET/tmp" \  # Needed for batch writes to BigQuery
-           --direct_num_workers 2
-```
 
-Preview load with a dry run:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
-           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
-           --temp_location "gs://$BUCKET/tmp" \  # Needed for batch writes to BigQuery
-           --direct_num_workers 2 \
-           --dry-run
-```
-
-Load COG's (.tif) files:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.tif" \
-           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
-           --temp_location "gs://$BUCKET/tmp" \  # Needed for batch writes to BigQuery
-           --direct_num_workers 2 \
-           --tif_metadata_for_datetime start_time
-```
-
-Upload only a subset of variables:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
-           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
-           --variables u10 v10 t
-           --temp_location "gs://$BUCKET/tmp" \
-           --direct_num_workers 2
-```
-
-Upload all variables, but for a specific geographic region (for example, the continental US):
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
-           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
-           --area 49 -124 24 -66 \
-           --temp_location "gs://$BUCKET/tmp" \
-           --direct_num_workers 2
-```
-
-Control how weather data is opened with XArray:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.grib" \
-           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
-           --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \
-           --temp_location "gs://$BUCKET/tmp" \
-           --direct_num_workers 2
-```
-
-Using DataflowRunner:
-
-```bash
-weather-mv bq --uris "gs://your-bucket/*.nc" \
-           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \
-           --runner DataflowRunner \
-           --project $PROJECT \
-           --region  $REGION \
-           --temp_location "gs://$BUCKET/tmp" \
-           --job_name $JOB_NAME 
-```
-
-For a full list of how to configure the Dataflow pipeline, please review
-[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
-
-### `weather-mv regrid`
+### Regrid
 
 ```
 usage: weather-mv regrid [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE] [--num_shards NUM_SHARDS] [-d]
@@ -179,25 +147,12 @@ usage: weather-mv regrid [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE
 
 The `regrid` subcommand makes a regridded copy of the input data with MetView.
 
-To use this capability of the weather mover, please use the `[regrid]` extra when installing:
-
-```shell
-pip install google-weather-tools[regrid]
-```
-
-> **Warning**: MetView requires a decent amount of disk space in order to perform any regrid operation! Intermediary 
-> regridding steps will write temporary grib data to disk. Thus, please make use of the `--disk_size_gb` Dataflow 
-> option. A good rule of thumb would be to consume `30 + 2.5x` GBs of disk, where `x` is the size of each source data
-> file. 
-> 
-> TODO([#191](https://github.com/google/weather-tools/issues/191)): Find smaller disk space bound.
-
-In addition to the common options above, users may specify command-specific options:
 
 _Command options_:
 
 * `-o, --output_path`: (required) The destination path for the regridded files.
-* `-k, --regrid_kwargs`: Keyword-args to pass into `metview.regrid()` in the form of a JSON string. Will default to
+* `-k, --regrid_kwargs`: Keyword-args to pass into `metview.regrid()` in the
+  form of a JSON string. Will default to
   '{"grid": [0.25, 0.25]}'.
 * `--to_netcdf`: Write output file in NetCDF via XArray. Default: off
 
@@ -206,11 +161,18 @@ consult [this documentation.](https://metview.readthedocs.io/en/latest/metview/u
 
 Invoke with `rg -h` or `regrid --help` to see the full range of options.
 
-> Note: Currently, `regrid` doesn't work out-of-the-box! Until [#172](https://github.com/google/weather-tools/issues/172)
-> is fixed, users will have to use a workaround in order to ensure MetView is installed in their runner environment
-> (instructions are below).
+> **Warning**: MetView requires a decent amount of disk space in order to
+> perform any regrid operation! Intermediary
+> regridding steps will write temporary grib data to disk. Thus, please make use
+> of the `--disk_size_gb` Dataflow
+> option. A good rule of thumb would be to consume `30 + 2.5x` GBs of disk,
+> where `x` is the size of each source data
+> file.
+>
+> TODO([#191](https://github.com/google/weather-tools/issues/191)): Find smaller
+> disk space bound.
 
-_Usage examples_:
+_Example_:
 
 ```bash
 weather-mv regrid --uris "gs://your-bucket/*.gb" \
@@ -218,82 +180,12 @@ weather-mv regrid --uris "gs://your-bucket/*.gb" \
  
 ```
 
-Using the subcomand alias 'rg':
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
-           --output_path "gs://regrid-bucket/" 
- 
-```
-
-Preview regrid with a dry run:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
-           --output_path "gs://regrid-bucket/" \
-           --dry-run
- 
-```
-
-Interpolate to a finer grid resolution:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
-           --output_path "gs://regrid-bucket/" \
-           --regrid_kwargs '{"grid": [0.1, 0.1]}'.
- 
-```
-
-Interpolate to a high-resolution octahedral gaussian grid:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
-           --output_path "gs://regrid-bucket/" \
-           --regrid_kwargs '{"grid": "O1280}'.
- 
-```
-
-Convert gribs to NetCDF on copy:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.gb" \
-           --output_path "gs://regrid-bucket/" \
-           --to_netcdf
-```
-
-Using DataflowRunner:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.nc" \
-           --output_path "gs://regrid-bucket/" \
-           --runner DataflowRunner \
-           --project $PROJECT \
-           --region  $REGION \
-           --temp_location "gs://$BUCKET/tmp" \
-           --experiment=use_runner_v2 \
-           --sdk_container_image="gcr.io/$PROJECT/$REPO:latest"  \
-           --job_name $JOB_NAME 
-```
-
-Using DataflowRunner, with added disk per VM:
-
-```bash
-weather-mv rg --uris "gs://your-bucket/*.nc" \
-           --output_path "gs://regrid-bucket/" \
-           --runner DataflowRunner \
-           --project $PROJECT \
-           --region  $REGION \
-           --disk_size_gb 250 \ 
-           --temp_location "gs://$BUCKET/tmp" \
-           --experiment=use_runner_v2 \
-           --sdk_container_image="gcr.io/$PROJECT/$REPO:latest"  \
-           --job_name $JOB_NAME 
-```
+For an extensive set of examples, visit [here](Examples.md#regrid).
 
 For a full list of how to configure the Dataflow pipeline, please review
 [this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
 
-### `weather-mv earthengine`
+### Earth Engine
 
 ```
 usage: weather-mv earthengine [-h] -i URIS --asset_location ASSET_LOCATION --ee_asset EE_ASSET
@@ -303,35 +195,52 @@ usage: weather-mv earthengine [-h] -i URIS --asset_location ASSET_LOCATION --ee_
                            [--ee_qps EE_QPS] [--ee_latency EE_LATENCY] [--ee_max_concurrent EE_MAX_CONCURRENT]
 ```
 
-The `earthengine` subcommand ingests weather data into Earth Engine. In addition to the common options above,
-users may specify command-specific options:
+The `earthengine` subcommand ingests weather data into Earth Engine. In addition
+to the general flags above, users may specify command-specific options:
 
 _Command options_:
 
-* `--asset_location`: (required) Bucket location at which asset files will be pushed.
-* `--ee_asset`: (required) The asset folder path in earth engine project where the asset files will be pushed.
-  It should be in format: `projects/<project-id>/assets/<asset-folder>`. Make sure that <asset-folder> is there
-  under <project-id> in earth engine assets. i.e. projects/my-gcp-project/assets/my/foo/bar.
-* `--ee_asset_type`: The type of asset to ingest in the earth engine. Default: IMAGE.
+* `--asset_location`: (required) Bucket location at which asset files will be
+  pushed.
+* `--ee_asset`: (required) The asset folder path in earth engine project where
+  the asset files will be pushed.
+  It should be in format: `projects/<project-id>/assets/<asset-folder>`. Make
+  sure that <asset-folder> is there
+  under <project-id> in earth engine assets. i.e.
+  projects/my-gcp-project/assets/my/foo/bar.
+* `--ee_asset_type`: The type of asset to ingest in the earth engine. Default:
+  IMAGE.
   Supported types are `IMAGE` and `TABLE`.\
   `IMAGE`: Uploads georeferenced raster datasets in GeoTIFF format.\
-  `TABLE`: Uploads the datsets in the CSV format. Useful in case of point data (sparse data). 
-* `--disable_grib_schema_normalization`:  Restricts merging of grib datasets. Default: False
-* `-u, --use_personal_account`: To use personal account for earth engine authentication.
-* `--service_account`: Service account address when using a private key for earth engine authentication.
-* `--private_key`: To use a private key for earth engine authentication. Only used with the `service_account` flag.
-* `--xarray_open_dataset_kwargs`: Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string.
-* `-s, --skip-region-validation` : Skip validation of regions for data migration. Default: off.
-* `--ee_qps`: Maximum queries per second allowed by EE for your project. Default: 10.
+  `TABLE`: Uploads the datsets in the CSV format. Useful in case of point data (
+  sparse data).
+* `--disable_grib_schema_normalization`:  Restricts merging of grib datasets.
+  Default: False
+* `-u, --use_personal_account`: To use personal account for earth engine
+  authentication.
+* `--service_account`: Service account address when using a private key for
+  earth engine authentication.
+* `--private_key`: To use a private key for earth engine authentication. Only
+  used with the `service_account` flag.
+* `--xarray_open_dataset_kwargs`: Keyword-args to pass
+  into `xarray.open_dataset()` in the form of a JSON string.
+* `-s, --skip-region-validation` : Skip validation of regions for data
+  migration. Default: off.
+* `--ee_qps`: Maximum queries per second allowed by EE for your project.
+  Default: 10.
 * `--ee_latency`: The expected latency per requests, in seconds. Default: 0.5.
-* `--ee_max_concurrent`: Maximum concurrent api requests to EE allowed for your project. Default: 10.
-* `--band_names_mapping`: A JSON file which contains the band names for the TIFF file.
-* `--initialization_time_regex`: A Regex string to get the initialization time from the filename.
-* `--forecast_time_regex`: A Regex string to get the forecast/end time from the filename.
+* `--ee_max_concurrent`: Maximum concurrent api requests to EE allowed for your
+  project. Default: 10.
+* `--band_names_mapping`: A JSON file which contains the band names for the TIFF
+  file.
+* `--initialization_time_regex`: A Regex string to get the initialization time
+  from the filename.
+* `--forecast_time_regex`: A Regex string to get the forecast/end time from the
+  filename.
 
 Invoke with `ee -h` or `earthengine --help` to see the full range of options.
 
-_Usage examples_:
+_Example_:
 
 ```bash
 weather-mv earthengine --uris "gs://your-bucket/*.grib" \
@@ -339,140 +248,56 @@ weather-mv earthengine --uris "gs://your-bucket/*.grib" \
            --ee_asset "projects/$PROJECT/assets/test_dir"
 ```
 
-Using the subcommand alias `ee`:
+For an extensive set of examples, visit [here](Examples.md#earth-engine).
 
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
-           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
-           --ee_asset "projects/$PROJECT/assets/test_dir"
-```
+## Configuring Dataflow
 
-Preview ingestion with a dry run:
+One of the most powerful features of Apache Beam is supporting a wide range of
+execution backends, called [_runners_](https://beam.apache.org/documentation/runners/direct/).
+The default [_Direct Runner_](https://beam.apache.org/documentation/runners/direct/)
+ is a multi-threaded single-node backend which executes the pipeline in the local 
+machine. In contrast, the [_Dataflow Runner_](https://beam.apache.org/documentation/runners/dataflow/)
+deploys the pipeline on [Cloud Dataflow](https://cloud.google.com/dataflow), 
+a fully-managed Google Cloud service which is highly optimized for Apache Beam
+pipelines. 
 
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
-           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
-           --ee_asset "projects/$PROJECT/assets/test_dir" \
-           --dry-run
-```
-
-Authenticate earth engine using personal account:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
-           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
-           --ee_asset "projects/$PROJECT/assets/test_dir" \
-           --use_personal_account
-```
-
-Authenticate earth engine using a private key:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
-           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
-           --ee_asset "projects/$PROJECT/assets/test_dir" \
-           --service_account "my-service-account@...gserviceaccount.com" \
-           --private_key "path/to/private_key.json"
-```
-
-Ingest asset as table in earth engine:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
-           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
-           --ee_asset "projects/$PROJECT/assets/test_dir" \
-           --ee_asset_type "TABLE"
-```
-
-Restrict merging all bands or grib normalization:
+Using Dataflow has many benefits including convenient high-performance
+integration with Cloud Storage and BigQuery as well as easy scaling.
+The runner used for Weather Tools pipelines can be configured using
+the `--runner` option (e.g. `--runner DataflowRunner`).
+Additional configurations may also be passed to customize the Dataflow
+pipeline including resources and autoscaling settings.
 
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
-           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
-           --ee_asset "projects/$PROJECT/assets/test_dir" \
-           --disable_grib_schema_normalization
-```
-
-Control how weather data is opened with XArray:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
-           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
-           --ee_asset "projects/$PROJECT/assets/test_dir"
-           --xarray_open_dataset_kwargs '{"engine": "cfgrib", "indexpath": "", "backend_kwargs": {"filter_by_keys": {"typeOfLevel": "surface", "edition": 1}}}' \
-           --temp_location "gs://$BUCKET/tmp"
-```
-
-Limit EE requests:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
-           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
-           --ee_asset "projects/$PROJECT/assets/test_dir" \
-           --ee_qps 10 \
-           --ee_latency 0.5 \
-           --ee_max_concurrent 10
-```
-
-Custom Band names:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.tif" \
-           --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
-           --ee_asset "projects/$PROJECT/assets/test_dir" \
-           --band_names_mapping "filename.json"
-```
-
-Getting initialization and forecast/end date-time from the filename:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.tif" \
-           --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
-           --ee_asset "projects/$PROJECT/assets/test_dir" \
-           --initialization_time_regex "$REGEX" \
-           --forecast_time_regex "$REGEX"
-```
-
-Example:
-
-```bash
-weather-mv ee --uris "gs://tmp-gs-bucket/3B-HHR-E_MS_MRG_3IMERG_20220901-S000000-E002959_0000_V06C_30min.tiff" \
-           --asset_location "gs://$BUCKET/assets" \ # Needed to store assets generated from *.tif
-           --ee_asset "projects/$PROJECT/assets/test_dir" \
-           --initialization_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S%H%M%S-*tiff" \
-           --forecast_time_regex "3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S*-E%H%M%S*tiff"
-```
-
-Using DataflowRunner:
-
-```bash
-weather-mv ee --uris "gs://your-bucket/*.grib" \
-           --asset_location "gs://$BUCKET/assets" \  # Needed to store assets generated from *.grib
-           --ee_asset "projects/$PROJECT/assets/test_dir"
-           --runner DataflowRunner \
-           --project $PROJECT \
-           --region  $REGION \
-           --temp_location "gs://$BUCKET/tmp" \
-           --job_name $JOB_NAME
-```
-
-For a full list of how to configure the Dataflow pipeline, please review
+For a full reference of how to configure the Dataflow pipeline, please review
 [this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
 
 ## Streaming ingestion
 
 `weather-mv` optionally provides the ability to react
-to [Pub/Sub events for objects added to GCS](https://cloud.google.com/storage/docs/pubsub-notifications). This can be
-used to automate ingestion into BigQuery as soon as weather data is disseminated. Another common use case it to
-automatically create a down-sampled version of a dataset with `regrid`. To set up the Weather Mover with streaming
-ingestion, use the `--topic` flag (see "Common options" above).
-
-Objects that don't match the `--uris` glob pattern will be filtered out of ingestion. This way, a bucket can contain
+to [Pub/Sub events for objects added to GCS](https://cloud.google.com/storage/docs/pubsub-notifications).
+This can be used to automate ingestion into BigQuery as soon as weather data is
+disseminated. Another common use case it to automatically create a down-sampled
+version of a dataset with `regrid`. To set up the Weather Mover with streaming
+ingestion, use the `--topic` flag (see [General Flags](#general-flags)).
+
+Objects that don't match the `--uris` glob pattern will be filtered out of
+ingestion. This way, a bucket can contain
 multiple types of data yet only have subsets processed with `weather-mv`.
 
-> It's worth noting: when setting up PubSub, **make sure to create a topic for GCS `OBJECT_FINALIZE` events only.**
+> **NOTE:** When setting up PubSub, make sure to create a topic for
+**GCS `OBJECT_FINALIZE` events only**.
+
+> **NOTE:** Data is written into BigQuery using streaming inserts. It may
+> take [up to 90 minutes](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability)
+> for buffers to persist into storage. However, weather data will be available for
+> querying immediately.
 
-_Usage examples_:
+> **NOTE:** It's recommended that you specify variables to
+> ingest (`-v, --variables`) instead of inferring the schema for
+> streaming pipelines. Not all variables will be distributed with every file,
+> especially when they are in Grib format.
+
+_Examples_:
 
 ```shell
 weather-mv bq --uris "gs://your-bucket/*.nc" \
@@ -510,24 +335,15 @@ weather-mv bq --uris "gs://your-bucket/*.nc" \
            --job_name $JOB_NAME 
 ```
 
-### BigQuery
-
-Data is written into BigQuery using streaming inserts. It may
-take [up to 90 minutes](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability)
-for buffers to persist into storage. However, weather data will be available for querying immediately.
-
-> Note: It's recommended that you specify variables to ingest (`-v, --variables`) instead of inferring the schema for
-> streaming pipelines. Not all variables will be distributed with every file, especially when they are in Grib format.
-
-## Private Network Configuration
+## Known Issues
 
-While running `weather-mv` pipeline in GCP, there is a possibility that you may receive following error -
-"Quotas were exceeded: IN_USE_ADDRESSES"
+### "Quotas were exceeded: IN_USE_ADDRESSES"
 
-This error occurs when GCP is trying to add new worker-instances and finds that, “Public IP” quota (assigned to your
-project) is exhausted.
+This error occurs when GCP is trying to add new worker-instances and finds that,
+“Public IP” quota (assigned to your project) is exhausted.
 
-To solve this, we recommend using private IP while running your dataflow pipelines.
+To solve this, we recommend using private IP while running your dataflow
+pipelines.
 
 ```shell
 weather-mv bq --uris "gs://your-bucket/*.nc" \
@@ -541,53 +357,63 @@ weather-mv bq --uris "gs://your-bucket/*.nc" \
            --subnetwork=regions/$REGION/subnetworks/$SUBNETWORK
 ```
 
-_Common options_:
+_Network configuration options_:
 
-* `--no_use_public_ips`: To make Dataflow workers use private IP addresses for all communication, specify the
-  command-line flag: --no_use_public_ips. Make sure that the specified network or subnetwork has Private Google Access
+* `--no_use_public_ips`: To make Dataflow workers use private IP addresses for
+  all communication, specify the
+  command-line flag: --no_use_public_ips. Make sure that the specified network
+  or subnetwork has Private Google Access
   enabled.
-* `--network`: The Compute Engine network for launching Compute Engine instances to run your pipeline.
-* `--subnetwork`:  The Compute Engine subnetwork for launching Compute Engine instances to run your pipeline.
-
-For more information regarding how to configure Private IP, please refer
-to [Private IP Configuration Guide for Dataflow Pipeline Execution](../Private-IP-Configuration.md)
-.
-
-For more information regarding Pipeline options, please refer
-to [pipeline-options](https://cloud.google.com/dataflow/docs/reference/pipeline-options).
-
-## Custom Dataflow Container for ECMWF dependencies (like MetView)
-
-It's difficult to install all necessary system dependencies on a Dataflow worker with a pure python solution. For
-example, MetView requires binaries to be installed on the system machine, which are broken in the standard debian
-install channels (they are only maintained via `conda-forge`).
-
-Thus, to include such dependencies, we've provided steps for you to build
-a [Beam container environment](https://beam.apache.org/documentation/runtime/environments/). In the near future, we'll
-arrange things so you don't have to worry about any of these extra
-steps ([#172](https://github.com/google/weather-tools/issues/172)). See [these instructions](../Runtime-Container.md) 
-to learn how to build a custom image for this project.
-
-Currently, this image is necessary for the `weather-mv regrid` command, but no other commands. To deploy this tool,
-please do the following:
-
-1. Host a container image of the included Dockerfile in your repository of choice (instructions for building images in
-   GCS are in the next section).
-2. Add the following two flags to your regrid pipeline.
-   ```
-   --experiment=use_runner_v2 \
-   --sdk_container_image=$CONTAINER_URL
-   ```
-   For example, the full Dataflow command, assuming you follow the next section's instructions, should look like:
-
-   ```bash
-   weather-mv rg --uris "gs://your-bucket/*.nc" \
-              --output_path "gs://regrid-bucket/" \
-              --runner DataflowRunner \
-              --project $PROJECT \
-              --region  $REGION \
-              --temp_location "gs://$BUCKET/tmp" \
-              --experiment=use_runner_v2 \
-              --sdk_container_image="gcr.io/$PROJECT/$REPO:latest"
-              --job_name $JOB_NAME 
-   ```
+* `--network`: The Compute Engine network for launching Compute Engine instances
+  to run your pipeline.
+* `--subnetwork`:  The Compute Engine subnetwork for launching Compute Engine
+  instances to run your pipeline.
+
+To learn more, visit [Private IP Configuration Guide for Dataflow Pipeline Execution](../Private-IP-Configuration.md). 
+
+For more information regarding Dataflow pipeline options, see the [Configuring Dataflow](#configuring-dataflow)
+section above.
+
+## Dataflow Container Image
+
+Weather Mover uses 3rd-party binary dependencies including MetView which can
+not be reliably set up using pure python solutions. To overcome this, we use
+the [Conda-forge](https://conda-forge.org/) distribution of these binaries.
+While our [Conda environment](https://github.com/google/weather-tools/blob/main/environment.yml)
+is sufficient for local runs, in remote executions like Dataflow, a Docker 
+container image with the dependencies pre-installed is required.
+
+By default, our public Docker image 
+`gcr.io/weather-tools-prod/weather-tools:x.y.z` is used when the pipeline is 
+executed using _DataflowRunner_. The image is built from [this Dockerfile](https://github.com/google/weather-tools/blob/main/Dockerfile)
+and produces the same Conda environment.
+
+For development purposes, you may use your own customized Docker image. See
+[these instructions](../Runtime-Container.md) to learn how to build a custom
+image for this project using [Cloud Build](https://cloud.google.com/build).
+After building and pushing the custom container image to a container registry of
+your choice (like [Cloud Container Registry](https://cloud.google.com/container-registry)),
+you can then specify the image used by Dataflow workers for running your 
+pipeline using these flags:
+
+ ```
+ --experiment=use_runner_v2 \
+ --sdk_container_image=$CONTAINER_URL
+ ```
+
+_Example:_
+
+ ```bash
+ weather-mv rg --uris "gs://your-bucket/*.nc" \
+            --output_path "gs://regrid-bucket/" \
+            --runner DataflowRunner \
+            --project $PROJECT \
+            --region  $REGION \
+            --temp_location "gs://$BUCKET/tmp" \
+            --experiment=use_runner_v2 \
+            --sdk_container_image="gcr.io/$PROJECT/$REPO:latest"
+            --job_name $JOB_NAME 
+ ```
+
+> **NOTE:** Currently, this image is only necessary for the `weather-mv regrid`
+> command as the other commands use pure python dependencies.

From 54a76769413c4972bf1b92d1f8c5365b8e58dab9 Mon Sep 17 00:00:00 2001
From: Iman Akbari <imakbari@gmail.com>
Date: Thu, 23 Feb 2023 12:32:16 -0500
Subject: [PATCH 2/3] Add default SDK image and make temp_location required

---
 weather_mv/README.md                   | 10 ++++----
 weather_mv/loader_pipeline/bq.py       |  2 ++
 weather_mv/loader_pipeline/pipeline.py | 34 ++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/weather_mv/README.md b/weather_mv/README.md
index 7d60677f..6efe97ca 100644
--- a/weather_mv/README.md
+++ b/weather_mv/README.md
@@ -2,14 +2,12 @@
 
 Weather Mover provides an easy and scalable way for:
 
-1. Loading weather datasets 
-   from Cloud Storage into [BigQuery](https://cloud.google.com/bigquery) and
+1. Loading weather dataset from Cloud Storage into [BigQuery](https://cloud.google.com/bigquery) and 
    [Google Earth Engine](https://earthengine.google.com/)
 2. Fast [interpolation & grid conversions](https://metview.readthedocs.io/en/latest/gen_files/icon_functions/regrid.html)
-on large weather datasets
+   on large weather datasets
 
-It supports popular multi-dimensional data formats including NetCDF, GRIB, 
-GeoTIFF, and all other formats [supported by Xarray](https://docs.xarray.dev/en/stable/user-guide/io.html).
+It supports popular multi-dimensional data formats including NetCDF, GRIB, and GeoTIFF.
 
 ## Why do we need this?
 
@@ -95,6 +93,8 @@ _Command options_:
 
 * `-o, --output_table`: (required) Full name of destination BigQuery table. Ex:
   my_project.my_dataset.my_table
+* `--temp_location`: (required) GCS directory used for staging temporary files
+  used in BigQuery table inserts
 * `-v, --variables`:  Target variables (or coordinates) for the BigQuery schema.
   Default: will import all data variables
   as columns.
diff --git a/weather_mv/loader_pipeline/bq.py b/weather_mv/loader_pipeline/bq.py
index d1833df1..0ade6b20 100644
--- a/weather_mv/loader_pipeline/bq.py
+++ b/weather_mv/loader_pipeline/bq.py
@@ -101,6 +101,8 @@ def add_parser_arguments(cls, subparser: argparse.ArgumentParser):
         subparser.add_argument('-o', '--output_table', type=str, required=True,
                                help="Full name of destination BigQuery table (<project>.<dataset>.<table>). Table "
                                     "will be created if it doesn't exist.")
+        subparser.add_argument('--temp_location', type=str, required=True,
+                               help="GCS directory used for staging temporary files used in BigQuery table inserts")
         subparser.add_argument('-v', '--variables', metavar='variables', type=str, nargs='+', default=list(),
                                help='Target variables (or coordinates) for the BigQuery schema. Default: will import '
                                     'all data variables as columns.')
diff --git a/weather_mv/loader_pipeline/pipeline.py b/weather_mv/loader_pipeline/pipeline.py
index 856ee45d..1c30a48d 100644
--- a/weather_mv/loader_pipeline/pipeline.py
+++ b/weather_mv/loader_pipeline/pipeline.py
@@ -28,6 +28,8 @@
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_CONTAINER_IMAGE = "gcr.io/weather-tools-prod/weather-tools:0.0.0"
+
 
 def configure_logger(verbosity: int) -> None:
     """Configures logging from verbosity. Default verbosity will show errors."""
@@ -45,6 +47,21 @@ def pattern_to_uris(match_pattern: str, is_zarr: bool = False) -> t.Iterable[str
         yield from [x.path for x in match.metadata_list]
 
 
+def _arg_list_includes(args: t.List[str], key: str, val: str = None) -> bool:
+    """check if a "--key [val]" or "--key=val" argument exists in args list"""
+    # single valued check (looking for "--key")
+    if val is None:
+        for arg in args:
+            if arg == f"--{key}" or arg.split("=")[0] == f"--{key}":
+                return True
+        return False
+    # double valued check (looking for "--key=value", "--key value")
+    for idx, arg in enumerate(args[:-1]):
+        if (arg == f"--{key}" and args[idx + 1] == val) or arg == f"--{key}={val}":
+            return True
+    return False
+
+
 def pipeline(known_args: argparse.Namespace, pipeline_args: t.List[str]) -> None:
     all_uris = list(pattern_to_uris(known_args.uris, known_args.zarr))
     if not all_uris:
@@ -127,8 +144,19 @@ def run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]:
                                       help='Move data into Google EarthEngine')
     ToEarthEngine.add_parser_arguments(ee_parser)
 
+    # print top-level help if zero arguments are passed
+    if len(argv) == 1:
+        parser.print_help()
+        exit(0)
+
     known_args, pipeline_args = parser.parse_known_args(argv[1:])
 
+    # temporary location is shown as a mandatory argument in help messages
+    # but also needs to be available in pipeline args
+    if hasattr(known_args, 'temp_location') and known_args.temp_location:
+        logger.debug("Using temporary location %s", known_args.temp_location)
+        pipeline_args.extend(['--temp_location', known_args.temp_location])
+
     configure_logger(2)  # 0 = error, 1 = warn, 2 = info, 3 = debug
 
     # Validate Zarr arguments
@@ -160,4 +188,10 @@ def run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]:
     # workflow rely on global context (e.g., a module imported at module level).
     pipeline_args.extend('--save_main_session true'.split())
 
+    # add default image for Dataflow runner
+    if _arg_list_includes(pipeline_args, 'runner', 'DataflowRunner') \
+       and not _arg_list_includes(pipeline_args, 'sdk_container_image'):
+        logger.info('Using default Dataflow container image "%s"', DEFAULT_CONTAINER_IMAGE)
+        pipeline_args.extend(['--sdk_container_image', DEFAULT_CONTAINER_IMAGE])
+
     return known_args, pipeline_args

From 14623bda79e8e8608f1a08600ce00ced4e80af0d Mon Sep 17 00:00:00 2001
From: Iman Akbari <imakbari@gmail.com>
Date: Thu, 23 Feb 2023 12:48:50 -0500
Subject: [PATCH 3/3] Fix type hint

---
 weather_mv/loader_pipeline/pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/weather_mv/loader_pipeline/pipeline.py b/weather_mv/loader_pipeline/pipeline.py
index 1c30a48d..15b1b399 100644
--- a/weather_mv/loader_pipeline/pipeline.py
+++ b/weather_mv/loader_pipeline/pipeline.py
@@ -47,7 +47,7 @@ def pattern_to_uris(match_pattern: str, is_zarr: bool = False) -> t.Iterable[str
         yield from [x.path for x in match.metadata_list]
 
 
-def _arg_list_includes(args: t.List[str], key: str, val: str = None) -> bool:
+def _arg_list_includes(args: t.List[str], key: str, val: t.Optional[str] = None) -> bool:
     """check if a "--key [val]" or "--key=val" argument exists in args list"""
     # single valued check (looking for "--key")
     if val is None: