From 794c7e88411d8818a5845d71fbb4a19782c97da0 Mon Sep 17 00:00:00 2001
From: Francis Hanna <hannafra@msu.edu>
Date: Thu, 16 Oct 2025 09:49:54 -0400
Subject: [PATCH 1/6] Natural Gas Upstream update - Addresses issue #260 -
 added ng_model_year parameter to the modelconfig files - added new function
 to generate the ng lci based on the ng_model_year parameter - added
 NG_MODEL_YEARS to globals.py - added the ng_model_year to the config files -
 minor fix in eia_trans_dist_grid_loss.py - important note: modified
 generate_upstream_ng to handle NaN values.     See comments in the code.

---
 electricitylci/eia_trans_dist_grid_loss.py    |   2 +-
 electricitylci/globals.py                     |   2 +
 electricitylci/model_config.py                |  13 +
 electricitylci/modelconfig/ELCI_1_config.yml  |   7 +
 .../modelconfig/ELCI_2020_config.yml          |   7 +
 .../modelconfig/ELCI_2021_config.yml          |   7 +
 .../modelconfig/ELCI_2022_config.yml          |   7 +
 .../modelconfig/ELCI_2023_config.yml          |   8 +-
 .../modelconfig/ELCI_2024_config.yml          |   7 +
 electricitylci/modelconfig/ELCI_2_config.yml  |   7 +
 electricitylci/modelconfig/ELCI_3_config.yml  |   7 +
 electricitylci/natural_gas_upstream.py        | 551 +++++++++++++++++-
 12 files changed, 616 insertions(+), 9 deletions(-)

diff --git a/electricitylci/eia_trans_dist_grid_loss.py b/electricitylci/eia_trans_dist_grid_loss.py
index d93d5c3f..3a7ac4f0 100644
--- a/electricitylci/eia_trans_dist_grid_loss.py
+++ b/electricitylci/eia_trans_dist_grid_loss.py
@@ -82,7 +82,7 @@ def eia_trans_dist_download_extract(year):
     pandas.DataFrame
     """
     # check in case year is passed as an int
-    if isinstance(year,str)
+    if isinstance(year,int):
         year = str(year)
     eia_trans_dist_loss = pd.DataFrame()
     old_path = os.getcwd()
diff --git a/electricitylci/globals.py b/electricitylci/globals.py
index 0c9916d0..acf84a24 100644
--- a/electricitylci/globals.py
+++ b/electricitylci/globals.py
@@ -233,6 +233,8 @@
 RENEWABLE_VINTAGES = [2016, 2020]
 '''list : The valid years for renewable inventories (i.e., 2016 and 2020).'''
 
+NG_MODEL_YEARS = [2016, 2020]
+'''list : The valid years for natural gas model (i.e., 2016 and 2020).'''
 
 ##############################################################################
 # FUNCTIONS
diff --git a/electricitylci/model_config.py b/electricitylci/model_config.py
index 9a052654..cb229187 100644
--- a/electricitylci/model_config.py
+++ b/electricitylci/model_config.py
@@ -17,6 +17,7 @@
 from electricitylci.globals import output_dir
 from electricitylci.globals import COAL_MODEL_YEARS
 from electricitylci.globals import RENEWABLE_VINTAGES
+from electricitylci.globals import NG_MODEL_YEARS
 
 
 ##############################################################################
@@ -141,6 +142,8 @@ class ModelSpecs:
         Absolute path to JSON-LD zip output file.
         File name includes the model name and current time stamp and is
         located by default in the output directory (see globals.py).
+    ng_model_year : int
+        The natural gas model year (e.g., 2016 or 2020).
     """
     def __init__(self, model_specs, model_name):
         """Class initialization.
@@ -201,6 +204,7 @@ def __init__(self, model_specs, model_name):
             f"{output_dir}/{model_name}_jsonld_"
             f"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
         )
+        self.ng_model_year = model_specs["ng_model_year"]
 
 
 ##############################################################################
@@ -330,9 +334,18 @@ def check_model_specs(model_specs):
         err_str += " or ".join([str(x) for x in COAL_MODEL_YEARS])
         err_str += " not %s!" % model_specs['coal_model_year']
         raise ConfigurationError(err_str)
+    
     if not model_specs['renewable_vintage'] in RENEWABLE_VINTAGES:
         err_str = "The renewable inventory vintage must be one of "
         err_str += " or ".join([str(x) for x in RENEWABLE_VINTAGES])
         err_str += " not %s!" % model_specs['renewable_vintage']
         raise ConfigurationError(err_str)
+    
+    if not model_specs['ng_model_year'] in NG_MODEL_YEARS:
+        err_str = "The natural gas model year must be one of "
+        err_str += " or ".join([str(x) for x in NG_MODEL_YEARS])
+        err_str += " not %s!" % model_specs['ng_model_year']
+        raise ConfigurationError(err_str)
+    
     logging.info("Checks passed!")
+
diff --git a/electricitylci/modelconfig/ELCI_1_config.yml b/electricitylci/modelconfig/ELCI_1_config.yml
index f5699bbb..80d7b3d5 100644
--- a/electricitylci/modelconfig/ELCI_1_config.yml
+++ b/electricitylci/modelconfig/ELCI_1_config.yml
@@ -31,6 +31,13 @@ replace_egrid: true
 # construction LCI.
 coal_model_year: 2020
 
+# NG baseline year
+# this is used to determine which NG inventory to use. 
+# this impacts the upstream ng emissions for production, gathering and boosting,
+# processing, transmission, storage, and distribution
+# Select between 2016 and 2020 model inventories.
+ng_model_year: 2016
+
 # NETL developed profiles for renewable generation to capture construction
 # and O&M impacts (e.g., solar PV manufacturing and power plant operations).
 # There are two vintages for renewable inventories: 2016 and 2020. The 2016
diff --git a/electricitylci/modelconfig/ELCI_2020_config.yml b/electricitylci/modelconfig/ELCI_2020_config.yml
index 36bf05f3..fbd4875e 100644
--- a/electricitylci/modelconfig/ELCI_2020_config.yml
+++ b/electricitylci/modelconfig/ELCI_2020_config.yml
@@ -31,6 +31,13 @@ replace_egrid: true
 # construction LCI.
 coal_model_year: 2023
 
+# NG baseline year
+# this is used to determine which NG inventory to use. 
+# this impacts the upstream ng emissions for production, gathering and boosting,
+# processing, transmission, storage, and distribution
+# Select between 2016 and 2020 model inventories.
+ng_model_year: 2020
+
 # NETL developed profiles for renewable generation to capture construction
 # and O&M impacts (e.g., solar PV manufacturing and power plant operations).
 # There are two vintages for renewable inventories: 2016 and 2020. The 2016
diff --git a/electricitylci/modelconfig/ELCI_2021_config.yml b/electricitylci/modelconfig/ELCI_2021_config.yml
index ea680a96..c39bdc8e 100644
--- a/electricitylci/modelconfig/ELCI_2021_config.yml
+++ b/electricitylci/modelconfig/ELCI_2021_config.yml
@@ -31,6 +31,13 @@ replace_egrid: true
 # construction LCI.
 coal_model_year: 2023
 
+# NG baseline year
+# this is used to determine which NG inventory to use. 
+# this impacts the upstream ng emissions for production, gathering and boosting,
+# processing, transmission, storage, and distribution
+# Select between 2016 and 2020 model inventories.
+ng_model_year: 2020
+
 # NETL developed profiles for renewable generation to capture construction
 # and O&M impacts (e.g., solar PV manufacturing and power plant operations).
 # There are two vintages for renewable inventories: 2016 and 2020. The 2016
diff --git a/electricitylci/modelconfig/ELCI_2022_config.yml b/electricitylci/modelconfig/ELCI_2022_config.yml
index 64e75bc5..94d53947 100644
--- a/electricitylci/modelconfig/ELCI_2022_config.yml
+++ b/electricitylci/modelconfig/ELCI_2022_config.yml
@@ -31,6 +31,13 @@ replace_egrid: true
 # construction LCI.
 coal_model_year: 2023
 
+# NG baseline year
+# this is used to determine which NG inventory to use. 
+# this impacts the upstream ng emissions for production, gathering and boosting,
+# processing, transmission, storage, and distribution
+# Select between 2016 and 2020 model inventories.
+ng_model_year: 2020
+
 # NETL developed profiles for renewable generation to capture construction
 # and O&M impacts (e.g., solar PV manufacturing and power plant operations).
 # There are two vintages for renewable inventories: 2016 and 2020. The 2016
diff --git a/electricitylci/modelconfig/ELCI_2023_config.yml b/electricitylci/modelconfig/ELCI_2023_config.yml
index fc0908d0..4182d437 100644
--- a/electricitylci/modelconfig/ELCI_2023_config.yml
+++ b/electricitylci/modelconfig/ELCI_2023_config.yml
@@ -31,6 +31,13 @@ replace_egrid: true
 # construction LCI.
 coal_model_year: 2023
 
+# NG baseline year
+# this is used to determine which NG inventory to use. 
+# this impacts the upstream ng emissions for production, gathering and boosting,
+# processing, transmission, storage, and distribution
+# Select between 2016 and 2020 model inventories.
+ng_model_year: 2020
+
 # NETL developed profiles for renewable generation to capture construction
 # and O&M impacts (e.g., solar PV manufacturing and power plant operations).
 # There are two vintages for renewable inventories: 2016 and 2020. The 2016
@@ -147,5 +154,4 @@ NETL_IO_trading_year: 2023
 # Product systems for the at-user consumption mixes are also generated.
 run_post_processes: true
 
-
 # OTHER PARAMETERS
diff --git a/electricitylci/modelconfig/ELCI_2024_config.yml b/electricitylci/modelconfig/ELCI_2024_config.yml
index 40e72044..c8caedce 100644
--- a/electricitylci/modelconfig/ELCI_2024_config.yml
+++ b/electricitylci/modelconfig/ELCI_2024_config.yml
@@ -31,6 +31,13 @@ replace_egrid: true
 # construction LCI.
 coal_model_year: 2023
 
+# NG baseline year
+# this is used to determine which NG inventory to use. 
+# this impacts the upstream ng emissions for production, gathering and boosting,
+# processing, transmission, storage, and distribution
+# Select between 2016 and 2020 model inventories.
+ng_model_year: 2020
+
 # NETL developed profiles for renewable generation to capture construction
 # and O&M impacts (e.g., solar PV manufacturing and power plant operations).
 # There are two vintages for renewable inventories: 2016 and 2020. The 2016
diff --git a/electricitylci/modelconfig/ELCI_2_config.yml b/electricitylci/modelconfig/ELCI_2_config.yml
index f3704a90..d0a56093 100644
--- a/electricitylci/modelconfig/ELCI_2_config.yml
+++ b/electricitylci/modelconfig/ELCI_2_config.yml
@@ -31,6 +31,13 @@ replace_egrid: true
 # construction LCI.
 coal_model_year: 2020
 
+# NG baseline year
+# this is used to determine which NG inventory to use. 
+# this impacts the upstream ng emissions for production, gathering and boosting,
+# processing, transmission, storage, and distribution
+# Select between 2016 and 2020 model inventories.
+ng_model_year: 2016
+
 # NETL developed profiles for renewable generation to capture construction
 # and O&M impacts (e.g., solar PV manufacturing and power plant operations).
 # There are two vintages for renewable inventories: 2016 and 2020. The 2016
diff --git a/electricitylci/modelconfig/ELCI_3_config.yml b/electricitylci/modelconfig/ELCI_3_config.yml
index 572f2c61..51332d55 100644
--- a/electricitylci/modelconfig/ELCI_3_config.yml
+++ b/electricitylci/modelconfig/ELCI_3_config.yml
@@ -31,6 +31,13 @@ replace_egrid: false
 # construction LCI.
 coal_model_year: 2020
 
+# NG baseline year
+# this is used to determine which NG inventory to use. 
+# this impacts the upstream ng emissions for production, gathering and boosting,
+# processing, transmission, storage, and distribution
+# Select between 2016 and 2020 model inventories.
+ng_model_year: 2016
+
 # NETL developed profiles for renewable generation to capture construction
 # and O&M impacts (e.g., solar PV manufacturing and power plant operations).
 # There are two vintages for renewable inventories: 2016 and 2020. The 2016
diff --git a/electricitylci/natural_gas_upstream.py b/electricitylci/natural_gas_upstream.py
index d05303b9..b5a1b5bf 100644
--- a/electricitylci/natural_gas_upstream.py
+++ b/electricitylci/natural_gas_upstream.py
@@ -8,6 +8,7 @@
 ##############################################################################
 import logging
 import os
+import sys
 
 import pandas as pd
 
@@ -16,6 +17,8 @@
 import electricitylci.PhysicalQuantities as pq
 from electricitylci.generation import add_temporal_correlation_score
 from electricitylci.model_config import model_specs
+from electricitylci.utils import download_edx
+from electricitylci.globals import paths
 ##############################################################################
 # MODULE DOCUMENTATION
 ##############################################################################
@@ -26,15 +29,72 @@
 Created:
     2019-02-18
 Last updated:
-    2024-01-10
+    2025-10-15
 """
 __all__ = [
     "generate_upstream_ng",
 ]
 
 
+#############################################################################
+# GLOBALS
 ##############################################################################
-# FUNCTIONS
+technobasins_basins = {
+    'Appalachian': ['FI - App Shale'],
+    'Alaska Offshore': ['FI - Alaska Offshore'],
+    'Anadarko': ['FI - Anadarko Conv','FI - Anadarko Shale', 'FI - Anadarko Tight'],
+    'Arkla': ['FI - Arkla Conv','FI - Arkla Shale','FI - Arkla Tight'],
+    'Arkoma': ['FI - Arkoma Conv','FI - Arkoma Shale'],
+    'East Texas': ['FI - East Texas Conv', 'FI - East Texas Shale', 'FI - East Texas Tight'],
+    'Fort Worth': ['FI - Fort Worth Shale'],
+    'Green River': ['FI - Green River Conv', 'FI - Green River Tight'],
+    'Gulf': ['FI - Gulf Conv', 'FI - Gulf Shale', 'FI - Gulf TIght'], ## This is not a typo - the sheet in the excel file is titled 'FI - Gulf TIght'
+    'Permian': ['FI - Permian Conv', 'FI - Permian Shale'],
+    'Piceance': ['FI - Piceance Tight'],
+    'San Juan': ['FI - San Juan CBM', 'FI - San Juan Shale'],
+    'South Oklahoma': ['FI - South OK Shale'],
+    'Strawn': ['FI - Strawn Shale'],
+    'Uinta': ['FI - Uinta Conv', 'FI - Uinta Tight'],
+    'GoM': ['FI - GoM Offshore']
+}   
+
+# Aliases mapping the alternative technobasin names used in the excel file to the
+# names of their 'FI - ...' sheets; the dictionary below is hardcoded
+
+aliases = {
+    'Appalachian Shale': 'FI - App Shale',
+    'Alaska Offshore': 'FI - Alaska Offshore',
+    'GoM Offshore': 'FI - GoM Offshore',
+    'Arkla Shale': 'FI - Arkla Shale',
+    'Arkla Tight': 'FI - Arkla Tight',
+    'Green River Conv': 'FI - Green River Conv',
+    'Green River Tight': 'FI - Green River Tight',
+    'Permian Conv': 'FI - Permian Conv',
+    'Gulf Tight': 'FI - Gulf TIght', ## This is not a typo - the sheet in the excel file is titled 'FI - Gulf TIght'
+    'Uinta Conv': 'FI - Uinta Conv',
+    'Gulf Conv': 'FI - Gulf Conv',
+    'Gulf Shale': 'FI - Gulf Shale',
+    'Permian Shale': 'FI - Permian Shale',
+    'Anadarko Shale': 'FI - Anadarko Shale',
+    'South Oklahoma Shale': 'FI - South OK Shale',
+    'Uinta Tight': 'FI - Uinta Tight',
+    'East Texas Tight': 'FI - East Texas Tight',
+    'East Texas Shale': 'FI - East Texas Shale',
+    'Strawn Shale': 'FI - Strawn Shale',
+    'Piceance Tight': 'FI - Piceance Tight',
+    'Fort Worth Shale': 'FI - Fort Worth Shale',
+    'Arkla Conv': 'FI - Arkla Conv',
+    'East Texas Conv': 'FI - East Texas Conv',
+    'Arkoma Shale': 'FI - Arkoma Shale',
+    'Anadarko Conv': 'FI - Anadarko Conv',
+    'San Juan CBM': 'FI - San Juan CBM',
+    'Anadarko Tight': 'FI - Anadarko Tight',
+    'Arkoma Conv': 'FI - Arkoma Conv',
+    'San Juan Shale': 'FI - San Juan Shale'
+}
+
+##############################################################################
+# MAIN FUNCTION
 ##############################################################################
 def generate_upstream_ng(year):
     """
@@ -100,11 +160,12 @@ def generate_upstream_ng(year):
         columns=['Plant Code']
     )
 
-    # Read the NG LCI excel file
-    ng_lci = pd.read_csv(
-        os.path.join(data_dir, "NG_LCI.csv"),
-        index_col=[0,1,2,3,4,5]
-    )
+    # Read the NG LCI file
+    # if ng_model_year = 2016 - this step reads NG_LCI.csv directly from the data_dir
+    # if ng_model_year = 2020 - this step uses the cached LCI if present; otherwise it
+    # downloads the ng model and mapping document from EDx (API key needed) and generates the lci
+    ng_lci = get_ng_lci(model_specs.ng_model_year)
+
     ng_lci_columns=[
         "Compartment",
         "FlowName",
@@ -174,8 +235,484 @@ def generate_upstream_ng(year):
     #generation is combined, it needs to be based on the target year for the
     #inventory.
     ng_lci_basin["Year"]=year
+
+    # Issue: the current basin-to-plant mapping document does not include the Alaska Offshore and GoM Offshore basins.
+    #        On the other hand, the ng_lci generated above includes emissions for both of these basins.
+    #        This causes NaN values in the 'ng_lci_basin' dataframe, which then raise errors when converting to int32.
+    #        The quick fix is to drop the NaN rows from the 'ng_lci_basin' dataframe - but this assumes that offshore
+    #        gas production is not used in electricity production.
+    #        A future fix involves updating the mapping document, 'gas_supply_basin_mapping.csv', to account for
+    #        offshore gas used in electricity production.
+    
+    ng_lci_basin = ng_lci_basin.dropna(subset=['FlowAmount'])
+    
     return ng_lci_basin
 
+##############################################################################
+# HELPER FUNCTIONS
+##############################################################################
+
+def get_ng_lci(year):
+    """
+    Get the natural gas life cycle inventory for a given model year.
+    Depending on the year, the natural gas life cycle inventory is either:
+        ** read from NG_LCI.csv in the data directory (2016)
+        ** read from, or generated into, the local 'netl' data folder (any other year)
+
+    Parameters
+    ----------
+    year : str, int
+        The natural gas model year (an int is converted to a string).
+        This is retrieved from the model configuration (ng_model_year).
+
+    Returns
+    -------
+    pandas.DataFrame
+        The natural gas inventory by basin, read from csv with its first six columns as the index.
+
+    Notes
+    -----
+    This method depends on:
+        ** the configuration parameter: ng_model_year
+        ** the NG_LCI csv file (if the old model is selected in the configuration)
+        ** the EDx API (if the new model is selected in the configuration)
+        ** the elci flow mapping csv file (if the new model is selected in the configuration)
+    """
+    if isinstance(year, int):
+        year = str(year)
+    if year == "2016":
+        logging.info("Retrieving the 2016 natural gas life cycle inventory by basin.")
+        ng_lci = pd.read_csv(
+            os.path.join(data_dir, "NG_LCI.csv"),
+            index_col=[0,1,2,3,4,5]
+        )
+    else:
+        data_folder = os.path.join(paths.local_path, 'netl')
+        # check if ng_lci_2020rev1.csv already exists - if it does, everything below can be skipped
+        if os.path.exists(os.path.join(data_folder, "ng_lci_2020rev1.csv")):
+            logging.info("NG LCI already exists in your data directory.")
+            ng_lci = pd.read_csv(
+                os.path.join(data_folder, "ng_lci_2020rev1.csv"),
+                index_col=[0,1,2,3,4,5]
+            )
+        else:
+            # if it does not exist, then we need to generate it
+            logging.info(f"Retrieving the {year} natural gas life cycle inventory by basin.")
+            # this step may require downloading files from edx
+            # retrieve ng model
+            # check if model is in data_folder (NOTE(review): the download below uses a different file name - confirm)
+            if os.path.exists(os.path.join(data_folder, "ng_model_2020Rev1.xlsx")):
+                logging.info("NG model already exists in your data directory.")
+                excel_file_path = os.path.join(data_folder, "ng_model_2020Rev1.xlsx")
+            else:
+                # download model from edx
+                logging.info("Downloading natural gas model from EDx.")
+                edx_api = model_specs.edx_api_key
+                r_id_ng_2020rev1 = 'cb8c8cf2-47ce-4ff0-b285-be73ba9294b9'
+                # resource id of 2020 Rev1 ng model on EDx
+                try:
+                    download_edx(resource_id = r_id_ng_2020rev1, api_key = edx_api, output_dir = data_folder)
+                    excel_file_path = os.path.join(data_folder, "Appendix_F_2020_Full_Inventory_Results_US_Avg_ProdThruTrans.xlsx")
+                except Exception as e:
+                    logging.error(f"Error downloading natural gas model from EDx. Error: {e}")
+                    sys.exit(1)
+            # retrieve flow mapping document from edx [elci.csv]
+            # check if flowmapping csv exists in data_folder
+            if os.path.exists(os.path.join(data_folder, "elci.csv")):
+                logging.info("ELCI flow mapping document already exists in your data directory.")
+                flow_mapping_path = os.path.join(data_folder, "elci.csv")
+            else:
+                edx_api = model_specs.edx_api_key # not yet set if the ng model was found on disk above
+                logging.info("Downloading ELCI flow mapping document from EDx.")
+                r_id_elci = 'e2c8f934-e95e-470a-879b-17ebe4afd39e' # resource id of elci flow mapping document on EDx
+                try:
+                    download_edx(resource_id = r_id_elci, api_key = edx_api, output_dir = data_folder)
+                    flow_mapping_path = os.path.join(data_folder, "elci.csv")
+                except Exception as e:
+                    logging.error(f"Error downloading ELCI flow mapping document from EDx. Error: {e}")
+                    sys.exit(1)
+            # production sheet name
+            production_sheet_name = '2020 Production Shares'
+            # run the generate_lci function, which saves the csv in data_folder, then read it back
+            try:
+                generate_lci (technobasins_basins, excel_file_path, flow_mapping_path, production_sheet_name, destination_path = data_folder, final_table_name = "ng_lci_2020rev1")
+                ng_lci = pd.read_csv(
+                    os.path.join(data_folder, "ng_lci_2020rev1.csv"),
+                    index_col=[0,1,2,3,4,5]
+                )
+            except Exception as e:
+                logging.error(f"Error generating natural gas life cycle inventory. Error: {e}")
+                sys.exit(1)
+    return ng_lci
+
+def generate_lci(technobasins_basins, excel_file_path, flow_mapping_path, production_sheet_name, destination_path, final_table_name):
+    """
+    This function reads the NG model excel file and generates a LCI for NG with the same format as the currently used file.
+
+    Args:
+        technobasins_basins (dict): A dictionary that maps each basin to its technobasin sheet names
+        excel_file_path (str): The path to the excel file
+        flow_mapping_path (str): The path to the flow mapping csv file (passed to correct_netl_flow_names)
+        production_sheet_name (str): The name of the sheet that contains the production shares
+        destination_path (str): The path to the destination folder where the csv file is written
+                                (required; passed to save_ng_lci).
+        final_table_name (str): The name of the csv file to be saved, without the extension
+                                (required; passed to save_ng_lci).
+
+    Returns:
+        final_table (pd.DataFrame): A dataframe with the LCI for NG with the same format as the currently used file.
+
+    Notes:
+        - The function is sensitive to the naming convention of the technobasins in the excel file.
+        - The current naming convention is: 'FI - <basin> <type>'. 
+        - Specifically, the current script is set up for the following sheet names:
+            - 'FI - App Shale', 'FI - Alaska Offshore', 'FI - Anadarko Conv', 'FI - Anadarko Shale', 'FI - Anadarko Tight', 
+            'FI - Arkla Conv', 'FI - Arkla Shale', 'FI - Arkla Tight', 'FI - Arkoma Conv', 'FI - Arkoma Shale', 'FI - East Texas Conv', 
+            'FI - East Texas Shale', 'FI - East Texas Tight', 'FI - Fort Worth Shale', 'FI - Green River Conv', 'FI - Green River Tight', 
+            'FI - Gulf Conv', 'FI - Gulf Shale', 'FI - Gulf TIght', 'FI - Permian Conv', 'FI - Permian Shale', 'FI - Piceance Tight', 
+            'FI - San Juan CBM', 'FI - San Juan Shale', 'FI - South OK Shale', 'FI - Strawn Shale', 'FI - Uinta Conv', 'FI - Uinta Tight', 
+            'FI - GoM Offshore'
+    """
+    # 0. Develop dictionary for basin, technobasins, and production shares
+    technobasins_basins = final_dictionary (technobasins_basins, excel_file_path, production_sheet_name)
+    logging.debug("Basin/technobasin production shares: %s", technobasins_basins)
+
+    final_table = pd.DataFrame()
+
+    # 1. Read excel file (the context manager closes the workbook handle once the sheet names are read)
+    with pd.ExcelFile(excel_file_path) as input_data:
+        sheet_names = input_data.sheet_names
+    # Keep the technobasin sheets only; the US Average sheet is dropped by name rather than by position
+    sheet_names = [name for name in sheet_names if name.startswith("FI") and name != "FI - US Average"]
+
+    # Get unused ground and water emissions based on average US emissions "FI - US Average"
+    unused_ground_emissions, unused_water_emissions = get_unused_flows(excel_file_path, "FI - US Average")
+
+    for sheet in sheet_names:
+        # Extract air, water, and ground emissions data for the selected sheet (i.e., technobasin)
+        air_emissions_data, water_emissions_data, ground_emissions_data = read_technobasin_data(excel_file_path, sheet)
+        
+        # Air emissions Get the correct flow names, compartment, and uuid for each flow
+        full_air_emissions_data = correct_netl_flow_names(air_emissions_data, flow_mapping_path)
+        full_air_emissions_data = full_air_emissions_data[full_air_emissions_data['FlowUUID'].notna()] # drop rows with FlowUUID NaN
+        
+        # Water emissions - drop unused flows (single vectorized filter instead of one drop per flow)
+        if unused_water_emissions is not None:
+            unused_water = unused_water_emissions['FlowName']
+            water_emissions_data = water_emissions_data[~water_emissions_data['FlowName'].isin(unused_water)]
+        # Water emissions - get the correct flow names, compartment, and uuid for each flow
+        full_water_emissions_data = correct_netl_flow_names(water_emissions_data, flow_mapping_path)
+        full_water_emissions_data = full_water_emissions_data[full_water_emissions_data['FlowUUID'].notna()] # drop rows with FlowUUID NaN
+        
+        # Ground emissions - drop unused flows (single vectorized filter instead of one drop per flow)
+        if unused_ground_emissions is not None:
+            unused_ground = unused_ground_emissions['FlowName']
+            ground_emissions_data = ground_emissions_data[~ground_emissions_data['FlowName'].isin(unused_ground)]
+        # Ground emissions - get the correct flow names, compartment, and uuid for each flow
+        full_ground_emissions_data = correct_netl_flow_names(ground_emissions_data, flow_mapping_path)
+        full_ground_emissions_data = full_ground_emissions_data[full_ground_emissions_data['FlowUUID'].notna()] # drop rows with FlowUUID NaN
+
+        # combine dataframes
+        df1 = pd.concat([full_air_emissions_data, full_water_emissions_data, full_ground_emissions_data])
+        df1 = df1.sort_values(by='FlowUUID') # sort by FlowUUID
+        basin_name = find_basin (technobasins_basins, sheet)
+        df1['FlowAmount'] = df1['FlowAmount'].astype(float)
+        df1['FlowAmount'] = df1['FlowAmount'].fillna(0)
+        norm_value = get_normalized_values(technobasins_basins, sheet)
+        df1['norm'] = norm_value
+        df1['norm'] = df1['norm'].astype(float)
+        df1['normalized_emissions'] = df1['FlowAmount'] * df1['norm']
+
+        # create final_table structure in 1st iteration
+        if final_table.empty:
+            final_table = df1[['FlowName', 'Compartment', 'Unit', 'input', 'FlowUUID']]
+            final_table = final_table.sort_values(by='FlowUUID')
+            final_table ['flow_type'] = 'ELEMENTARY_FLOW'
+            # reorder and rename columns (.copy() so the basin columns below are set on an independent frame)
+            final_table = final_table[['Compartment', 'FlowName', 'FlowUUID', 'Unit', 'flow_type', 'input']].copy()
+            final_table.columns = ['compartment', 'flow_name', 'uuid', 'unit', 'flow_type', 'is_input']
+            # add a column for each basin
+            basins_columns = list (technobasins_basins.keys())
+            for basin in basins_columns:
+                final_table[basin] = 0
+        # NOTE: the values below are added positionally, so every sheet must yield the same flows
+        # (same row count, same FlowUUID order) as the first sheet processed.
+        
+        # Compute normalized emissions and add to final table   
+        try:
+            final_table['normalized_emissions'] = df1['normalized_emissions'].values
+            final_table[basin_name] += final_table['normalized_emissions']
+            final_table = final_table.drop(columns=['normalized_emissions'])
+        except Exception as e:
+            sys.exit(f"Error adding sheet '{sheet}' to the final table. Make sure your excel file follows the correct naming convention (see the sheet names listed in the generate_lci docstring) and that every sheet lists the same flows. Error: {e}")
+
+
+    # 2. Save final table to csv
+    save_ng_lci(final_table, final_table_name ,destination_path)
+    logging.info(f"Final table saved to {destination_path}/{final_table_name}.csv")
+    
+    return final_table
+
+def get_unused_flows(excel_file_path, sheet_name):
+    """
+    Extract the ground and water flows whose amount is exactly zero in a given sheet of the natural gas results workbook.
+
+    Inputs:
+    - excel_file_path: path to the excel file
+    - sheet_name: name of the sheet to extract the data from
+
+    Outputs:
+    - unused_ground_emissions: dataframe (FlowName, FlowAmount) of the ground emission rows with FlowAmount == 0
+    - unused_water_emissions: dataframe (FlowName, FlowAmount) of the water emission rows with FlowAmount == 0
+    """
+    us_average_data = pd.read_excel(excel_file_path, sheet_name=sheet_name,skiprows=0,header=None) 
+    us_average_data.iloc[0] = us_average_data.iloc[0].ffill()
+    us_average_data.iloc[1] = us_average_data.iloc[1].ffill()
+    us_average_data.columns = us_average_data.iloc[2]
+    us_average_data = us_average_data.drop(columns=["P2.5", "P97.5"])
+    us_average_data.columns = us_average_data.iloc[0]
+    us_average_data = us_average_data.drop(us_average_data.index[0])
+    # ground data: the third-to-last column holds the flow names, the second-to-last the amounts
+    ground_emissions_data = us_average_data.iloc[:, [us_average_data.shape[1]-3, us_average_data.shape[1]-2]]
+    ground_emissions_data.columns.values[0] = "FlowName"
+    ground_emissions_data.columns.values[1] = "FlowAmount"
+    ground_emissions_data = ground_emissions_data.dropna()
+    ground_emissions_data = ground_emissions_data.iloc[1:]
+    # water data: the third-to-last column holds the flow names, the last column the amounts
+    water_emissions_data = us_average_data.iloc[:, [us_average_data.shape[1]-3, us_average_data.shape[1]-1]]
+    water_emissions_data.columns.values[0] = "FlowName"
+    water_emissions_data.columns.values[1] = "FlowAmount"
+    water_emissions_data = water_emissions_data.iloc[2:]
+    water_emissions_data = water_emissions_data.dropna()
+    # unused ground emissions: rows whose amount is exactly zero
+    unused_ground_emissions = ground_emissions_data[ground_emissions_data['FlowAmount'] == 0.00e+00]
+    # unused water emissions: rows whose amount is exactly zero
+    unused_water_emissions = water_emissions_data[water_emissions_data['FlowAmount'] == 0.00e+00]
+
+    return unused_ground_emissions, unused_water_emissions
+
+def read_technobasin_data(excel_file_path, sheet_name):
+    """
+    This function reads one sheet of the excel file and builds three dataframes of NG emissions: air, water, and ground.
+    Each dataframe has FlowName, FlowAmount, Compartment, Unit ('kg'), and input (False); P2.5 and P97.5 columns are dropped.
+
+    Inputs:
+    - excel_file_path: path to the excel file
+    - sheet_name: name of the sheet to extract the data from
+
+    Outputs:
+    - air_emissions_data: dataframe containing the air emissions data
+    - water_emissions_data: dataframe containing the water emissions data
+    - ground_emissions_data: dataframe containing the ground emissions data
+    """
+    print(f"Processing sheet: {sheet_name}")
+    # placeholder frame; it is replaced right away by the sheet contents read below
+    df = pd.DataFrame()
+    # Extract all the data from the sheet
+    df = pd.read_excel(excel_file_path, sheet_name=sheet_name, skiprows=0, header=None)
+    # Adjustments: 1) changing header, 2) dropping P2.5 and P97.5 columns
+    df.iloc[0] = df.iloc[0].ffill()
+    df.iloc[1] = df.iloc[1].ffill()
+    df.columns = df.iloc[2]
+    df = df.drop(columns=["P2.5", "P97.5"])
+    df.columns = df.iloc[0]
+    df = df.drop(df.index[0])
+    # separate the air, water, and ground emissions (no flow mapping is done in this function)
+    # Air emissions
+    air_emissions_data = df.drop(columns=[col for col in df.columns if col != df.columns[1]])  # keep only columns labelled like the second column
+    air_emissions_data = air_emissions_data.iloc[:, :-2]    # drop the last two columns (empty columns from excel)
+    air_emissions_data[f'FlowAmount'] = air_emissions_data.iloc[:, 1:11].sum(axis=1)  # sum the 2nd through 11th columns (positions 1-10) for each row
+    air_emissions_data = air_emissions_data.iloc[2:]
+    air_emissions_data = air_emissions_data.iloc[:, [0,-1]]
+    air_emissions_data['Compartment'] = 'Air' # add compartment
+    air_emissions_data.columns.values[0] = 'FlowName' # change header (NOTE(review): mutates the column index array in place - fragile, confirm)
+    air_emissions_data['Unit'] = 'kg' # add unit
+    air_emissions_data ['input'] = False # add input
+    # Water emissions: third-to-last column holds the flow names, the last column the amounts
+    water_emissions_data = df.iloc[:, [df.shape[1]-3, df.shape[1]-1]]
+    water_emissions_data.columns.values[0] = "FlowName"
+    water_emissions_data.columns.values[1] = "FlowAmount"
+    water_emissions_data = water_emissions_data.iloc[2:]
+    water_emissions_data = water_emissions_data.dropna()
+    water_emissions_data['Compartment'] = 'Water'
+    water_emissions_data['Unit'] = 'kg'
+    water_emissions_data ['input'] = False
+    # Ground emissions: third-to-last column holds the flow names, the second-to-last the amounts
+    ground_emissions_data = df.iloc[:, [df.shape[1]-3, df.shape[1]-2]]
+    ground_emissions_data.columns.values[0] = "FlowName"
+    ground_emissions_data.columns.values[1] = "FlowAmount"
+    ground_emissions_data = ground_emissions_data.dropna()
+    ground_emissions_data = ground_emissions_data.iloc[1:]
+    ground_emissions_data['Compartment'] = 'Ground'
+    ground_emissions_data['Unit'] = 'kg'
+    ground_emissions_data ['input'] = False
+
+    return air_emissions_data, water_emissions_data, ground_emissions_data
+
+
+# Helper: share of a technobasin within its parent basin (value / basin total)
+def get_normalized_values(technobasins_basins, technobasin):
+    """Return the technobasin's value divided by its basin's total, or None if unlisted."""
+    owners = (group for group in technobasins_basins.values() if technobasin in group)
+    group = next(owners, None)
+    # only the first basin (dict order) containing the technobasin is used
+    return None if group is None else float(group[technobasin] / sum(group.values()))
+
+# Helper: return the basin whose technobasin collection holds the given name
+def find_basin(technobasins_basins, technobasin_name):
+    matches = (basin for basin, members in technobasins_basins.items()
+               if technobasin_name in members)
+    # first hit in dict order, or None when no basin lists the technobasin
+    return next(matches, None)
+
+# Helper function: map a production-share scenario label to its sheet name via ``aliases``
+def _normalize_technobasin_naming(name):
+    """Return the canonical name for ``name``: exact alias match first, else first partial match, else None."""
+    name_lower = name.lower().strip() if isinstance(name, str) else ""
+    if not name_lower:  # NaN/blank labels: a blank used to substring-match every alias, NaN used to raise
+        return None
+    lookup = {alias.lower().strip(): canonical for alias, canonical in aliases.items()}
+    partial = (c for a, c in lookup.items() if name_lower in a or a in name_lower)
+    return lookup.get(name_lower, next(partial, None))
+
+# Helper: replace each basin's technobasin list with {technobasin: production share} read from the workbook
+def final_dictionary(technobasins_basins, excel_file_path, production_sheet_name):
+    production_shares_2020 = pd.read_excel(excel_file_path, sheet_name=production_sheet_name)  # first sheet row becomes the header
+    production_shares_2020 = production_shares_2020.iloc[1:]  # skip the first data row
+    production_shares_2020['Scenario Normalized'] = production_shares_2020['Scenario'].apply(lambda x: _normalize_technobasin_naming(x))  # alias -> canonical name
+    production_shares_2020 = production_shares_2020.drop(columns=production_shares_2020.columns[0])  # drop the left-most column
+    production_shares_2020.columns.values[1] = 'Scenario'  # relabel the second remaining column in place (positional; depends on sheet layout)
+    production_shares_2020 = production_shares_2020[['Scenario', 'Production Shares (%)']]
+    # Look up each technobasin's share by its 'Scenario' label; .loc raises KeyError for a label missing from the sheet
+    technobasins_basins = {
+        key: {num: production_shares_2020.set_index('Scenario').loc[num, 'Production Shares (%)'] for num in nums}  # index is rebuilt for every lookup
+        for key, nums in technobasins_basins.items()
+    }
+    return technobasins_basins  # a new dict; the object passed in is not modified
+
+def save_ng_lci(df, filename, destination_path):
+    """
+    Write ``df`` as ``<filename>.csv`` (index omitted) inside ``destination_path``.
+    ``None`` arguments fall back to the working directory and 'final_table'.
+    """
+    out_dir = f"{os.getcwd()}/" if destination_path is None else destination_path
+    stem = 'final_table' if filename is None else filename
+    # the '.csv' suffix is always appended to the given file name
+    csv_path = os.path.join(out_dir, f"{stem}.csv")
+    df.to_csv(csv_path, index=False)
+
+def correct_netl_flow_names(df, flow_mapping_path, amount_col="FlowAmount"):
+    """A helper method that replaces NETL air, water, and ground emissions
+    with Federal Elementary Flow List equivalents based on a subset of
+    flows defined in USEPA's eLCI mapping using the Python package
+    `fedelemflowlist <https://github.com/USEPA/fedelemflowlist>`_
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        A life cycle inventory data frame with columns, 'FlowName',
+        'Compartment', 'Unit', 'input', and ``amount_col``.
+    flow_mapping_path : str
+        Path to the flow mapping CSV (read with ISO-8859-1 encoding).
+    amount_col : str, optional
+        The column title representing the flow amount, by default "FlowAmount"
+
+    Returns
+    -------
+    pandas.DataFrame
+        The left-merged frame: matched flows take the FEDEFL name, context,
+        unit, and UUID with amounts scaled by 'ConversionFactor'; unmatched
+        flows are returned 'as is'. Remaining mapping columns are dropped.
+    """
+    # This data frame has about 4k source flow names and contexts associated
+    # with NETL unit process models (e.g., petro, nuclear, coal).
+    flow_mapping = pd.read_csv(flow_mapping_path, encoding='ISO-8859-1')
+
+    # Matching occurs on name, compartment and units; help this along by
+    # lowering the case (improves coal UP matches from 10% to 42%).
+    df["FlowName_orig"] = df["FlowName"]
+    df["Compartment_orig"] = df["Compartment"]
+    df["FlowName"] = df["FlowName"].str.lower().str.rstrip()
+    df["Compartment"] = df["Compartment"].str.lower().str.rstrip()
+
+    # In the map, also lower-case names and compartments and remove trailing
+    # space; note this introduces duplicate entries in the map, so remove them.
+    # The duplicates are from later entries, so ignore mapper, verifier and
+    # last updated cols when searching for duplicates. [250917; TWD]
+    flow_mapping['SourceFlowName'] = flow_mapping[
+        'SourceFlowName'].str.lower().str.rstrip()
+    flow_mapping['SourceFlowContext'] = flow_mapping[
+        "SourceFlowContext"].str.lower().str.rstrip()
+    ignore_cols = ['Mapper', 'Verifier', 'LastUpdated']
+    flow_mapping = flow_mapping.drop_duplicates(
+        subset=[x for x in flow_mapping.columns if x not in ignore_cols]
+    )
+
+    # Some compartments in NETL UPs are complex (e.g., 'Emission to water/fresh
+    # water'), but are listed simply in the FEDEFL eLCI mapper (e.g., 'emission/
+    # water'). Improves coal mining UP matches from 42% to 62%.
+    is_emission = df['input'] == False
+    is_water = df['Compartment'].str.contains('water')
+    is_air = df['Compartment'].str.contains('air')
+    is_ground = df['Compartment'].str.contains('ground')
+
+    # The product of two boolean masks acts as an element-wise logical AND.
+    df.loc[is_emission * is_water, 'Compartment'] = 'emission/water'
+    df.loc[is_emission * is_air, 'Compartment'] = 'emission/air'
+    df.loc[is_emission * is_ground, 'Compartment'] = 'emission/ground'
+
+    # HOTFIX: Map against source units [250205; TWD]
+    # For coal mining, reduces matches from >62% to <62% (about 2k less rows)
+    logging.info("Mapping emissions to FEDEFL")
+    mapped_df = pd.merge(
+        df,
+        flow_mapping,
+        left_on=["FlowName", "Compartment", "Unit"],
+        right_on=["SourceFlowName", "SourceFlowContext", "SourceUnit"],
+        how="left",
+    )
+
+    # If TargetFlowName is present, there was a match.
+    is_match = mapped_df["TargetFlowName"].notnull()
+    logging.info("Correcting %d NETL flows" % is_match.sum())
+
+    # Quality Check (coal_df)
+    #   Check that target unit matches source unit.
+    #   No! Hydrogen, Uranium, and Lead-210/kg have mis-matched units.
+    #   Therefore, unit conversions are necessary.
+
+    # Return flow names and compartments back to their original values.
+    df["FlowName"] = df["FlowName_orig"]
+    df["Compartment"] = df["Compartment_orig"]
+    del df['FlowName_orig']      # use this syntax since you're editing
+    del df['Compartment_orig']   # a reference object that isn't returned
+    mapped_df['FlowName'] = mapped_df['FlowName_orig']
+    mapped_df["Compartment"] = mapped_df["Compartment_orig"]
+    mapped_df = mapped_df.drop(columns=['FlowName_orig', 'Compartment_orig'])
+
+    # Replace FlowName, Unit, and Compartment with new names (where matched)
+    mapped_df.loc[is_match, "FlowName"] = mapped_df.loc[
+        is_match, "TargetFlowName"]
+    mapped_df.loc[is_match, "Compartment"] = mapped_df.loc[
+        is_match, "TargetFlowContext"]
+    mapped_df.loc[is_match, "Unit"] = mapped_df.loc[is_match, "TargetUnit"]
+
+    # Correct values using the conversion factor
+    mapped_df.loc[is_match, amount_col] *= mapped_df.loc[
+        is_match, 'ConversionFactor']
+
+    if 'FlowUUID' in mapped_df.columns:
+        # Update existing values with new UUIDs
+        mapped_df.loc[is_match, 'FlowUUID'] = mapped_df.loc[
+            is_match, 'TargetFlowUUID']
+    else:
+        # Set UUIDs to target values (unmatched rows get a null UUID)
+        mapped_df = mapped_df.rename(columns={"TargetFlowUUID": "FlowUUID"})
+
+    # Drop all unneeded cols (every mapping-file column still present)
+    drop_cols = [x for x in flow_mapping.columns if x in mapped_df.columns]
+    mapped_df = mapped_df.drop(columns=drop_cols)
+
+    return mapped_df
 
 ##############################################################################
 # MAIN

From 5f1b99a38032620c36e6d2a95e85865e31e81edf Mon Sep 17 00:00:00 2001
From: Francis Hanna <hannafra@msu.edu>
Date: Mon, 20 Oct 2025 12:44:01 -0400
Subject: [PATCH 2/6] Updated ng model from basin to region basis - Updated the
 2020 ng model to use the region basis - Implemented additional changes to map
 eia plants to regions - Updated the code to use basin mapping for 2016 and
 region basis for 2020 ng model

---
 electricitylci/natural_gas_upstream.py | 571 ++++++++++++-------------
 1 file changed, 274 insertions(+), 297 deletions(-)

diff --git a/electricitylci/natural_gas_upstream.py b/electricitylci/natural_gas_upstream.py
index b5a1b5bf..92dd9a95 100644
--- a/electricitylci/natural_gas_upstream.py
+++ b/electricitylci/natural_gas_upstream.py
@@ -35,64 +35,35 @@
     "generate_upstream_ng",
 ]
 
-
-#############################################################################
-# GLOBALS
-##############################################################################
-technobasins_basins = {
-    'Appalachian': ['FI - App Shale'],
-    'Alaska Offshore': ['FI - Alaska Offshore'],
-    'Anadarko': ['FI - Anadarko Conv','FI - Anadarko Shale', 'FI - Anadarko Tight'],
-    'Arkla': ['FI - Arkla Conv','FI - Arkla Shale','FI - Arkla Tight'],
-    'Arkoma': ['FI - Arkoma Conv','FI - Arkoma Shale'],
-    'East Texas': ['FI - East Texas Conv', 'FI - East Texas Shale', 'FI - East Texas Tight'],
-    'Fort Worth': ['FI - Fort Worth Shale'],
-    'Green River': ['FI - Green River Conv', 'FI - Green River Tight'],
-    'Gulf': ['FI - Gulf Conv', 'FI - Gulf Shale', 'FI - Gulf TIght'], ## This not a typo - the title of the sheet in the excel file is 'FI - Gulf TIght'
-    'Permian': ['FI - Permian Conv', 'FI - Permian Shale'],
-    'Piceance': ['FI - Piceance Tight'],
-    'San Juan': ['FI - San Juan CBM', 'FI - San Juan Shale'],
-    'South Oklahoma': ['FI - South OK Shale'],
-    'Strawn': ['FI - Strawn Shale'],
-    'Uinta': ['FI - Uinta Conv', 'FI - Uinta Tight'],
-    'GoM': ['FI - GoM Offshore']
-}   
-
-# Aliases to account for different naming conventions of technobasins used in the excel file
-# the below dictionary is hardcoded
-
-aliases = {
-    'Appalachian Shale': 'FI - App Shale',
-    'Alaska Offshore': 'FI - Alaska Offshore',
-    'GoM Offshore': 'FI - GoM Offshore',
-    'Arkla Shale': 'FI - Arkla Shale',
-    'Arkla Tight': 'FI - Arkla Tight',
-    'Green River Conv': 'FI - Green River Conv',
-    'Green River Tight': 'FI - Green River Tight',
-    'Permian Conv': 'FI - Permian Conv',
-    'Gulf Tight': 'FI - Gulf TIght', ## This not a typo - the title of the sheet in the excel file is 'FI - Gulf TIght'
-    'Uinta Conv': 'FI - Uinta Conv',
-    'Gulf Conv': 'FI - Gulf Conv',
-    'Gulf Shale': 'FI - Gulf Shale',
-    'Permian Shale': 'FI - Permian Shale',
-    'Anadarko Shale': 'FI - Anadarko Shale',
-    'South Oklahoma Shale': 'FI - South OK Shale',
-    'Uinta Tight': 'FI - Uinta Tight',
-    'East Texas Tight': 'FI - East Texas Tight',
-    'East Texas Shale': 'FI - East Texas Shale',
-    'Strawn Shale': 'FI - Strawn Shale',
-    'Piceance Tight': 'FI - Piceance Tight',
-    'Fort Worth Shale': 'FI - Fort Worth Shale',
-    'Arkla Conv': 'FI - Arkla Conv',
-    'East Texas Conv': 'FI - East Texas Conv',
-    'Arkoma Shale': 'FI - Arkoma Shale',
-    'Anadarko Conv': 'FI - Anadarko Conv',
-    'San Juan CBM': 'FI - San Juan CBM',
-    'Anadarko Tight': 'FI - Anadarko Tight',
-    'Arkoma Conv': 'FI - Arkoma Conv',
-    'San Juan Shale': 'FI - San Juan Shale'
+# Supporting dicts for the region-based (2020) natural gas model
+# Region name -> 'FI - <region> Delivery' label (presumably the worksheet to read; TODO confirm against generate_lci)
+region_sheets_dict = {
+    'Pacific': 'FI - Pacific Delivery',
+    'Rocky Mountain': 'FI - Rocky Mountain Delivery',
+    'Southwest': 'FI - Southwest Delivery',
+    'Midwest': 'FI - Midwest Delivery',
+    'Southeast': 'FI - Southeast Delivery',
+    'Northeast': 'FI - Northeast Delivery'
+ }
+
+r_ids_2020 = {  # regional workbook file name -> EDX resource id passed to download_edx
+    'Appendix_F_2020_Full_Inventory_Results_Midwest_ProdThruTrans.xlsx':'5665de40-fc2b-4643-b647-ceec226af2bb', 
+    'Appendix_F_2020_Full_Inventory_Results_Northeast_ProdThruTrans.xlsx' :'b396eb50-72ac-45f0-8231-9b613457c6d8', 
+    'Appendix_F_2020_Full_Inventory_Results_Pacific_ProdThruTrans.xlsx' :'347a0cd8-5ff2-4cb3-be0a-f31a56bac9c6', 
+    'Appendix_F_2020_Full_Inventory_Results_Rocky_Mountain_ProdThruTrans.xlsx' :'d08f4da2-543a-40b2-9ffd-c7138ed4f8c6', 
+    'Appendix_F_2020_Full_Inventory_Results_Southeast_ProdThruTrans.xlsx' :'4590712b-db21-4428-b488-6ded3b65d18b', 
+    'Appendix_F_2020_Full_Inventory_Results_Southwest_ProdThruTrans.xlsx':'9dd7a6e5-df1a-461e-87e7-0b9d8d600f26'
 }
 
+region_state_mapping = {  # two-letter state code -> NG delivery region
+    'WA':'Pacific','CA':'Pacific','OR':'Pacific','MT':'Rocky Mountain','ID':'Rocky Mountain','CO':'Rocky Mountain','NV':'Rocky Mountain','UT':'Rocky Mountain','WY':'Rocky Mountain',
+    'AZ':'Southwest','NM':'Southwest','OK':'Southwest','TX':'Southwest','MN':'Midwest','ND':'Midwest','IA':'Midwest','KS':'Midwest',
+    'MO':'Midwest','NE':'Midwest','SD':'Midwest','IL':'Midwest','IN':'Midwest','OH':'Midwest','WI':'Midwest','MI':'Midwest',
+    'AR':'Southeast','LA':'Southeast','AL':'Southeast','FL':'Southeast','GA':'Southeast','MS':'Southeast','SC':'Southeast','KY':'Southeast',
+    'NC':'Southeast','TN':'Southeast','VA':'Southeast','WV':'Southeast','DE':'Southeast','MD':'Southeast','CT':'Northeast','MA':'Northeast',
+    'NH':'Northeast','RI':'Northeast','VT':'Northeast','NJ':'Northeast','NY':'Northeast','PA':'Northeast','ME':'Northeast',
+} # TOTAL 48 states -- AK, HI, and DC are not mapped (AL is included under Southeast)
+
 ##############################################################################
 # MAN FUNCTION
 ##############################################################################
@@ -124,134 +95,243 @@ def generate_upstream_ng(year):
     """
     logging.info("Generating natural gas inventory")
 
-    # Get the EIA generation data for the specified year, this dataset includes
-    # the fuel consumption for generating electricity for each facility
-    # and fuel type. Filter the data to only include NG facilities and on
-    # positive fuel consumption. Group that data by Plant Id as it is possible
-    # to have multiple rows for the same facility and fuel based on different
-    # prime movers (e.g., gas turbine and combined cycle).
-    eia_generation_data = eia923_download_extract(year)
-
-    column_filt = ((eia_generation_data['Reported Fuel Type Code'] == 'NG') &
-                   (eia_generation_data['Total Fuel Consumption MMBtu'] > 0))
-    ng_generation_data = eia_generation_data[column_filt]
-
-    ng_generation_data = ng_generation_data.groupby('Plant Id').agg(
-        {'Total Fuel Consumption MMBtu':'sum'}).reset_index()
-    ng_generation_data['Plant Id'] = ng_generation_data['Plant Id'].astype(int)
-
-    # Import the mapping file which has the source gas basin for each Plant Id.
-    # NOTE:
-    #   This is a 2 MB file that provides about 100 kB of info!
-    ng_basin_mapping = pd.read_csv(
-        os.path.join(data_dir, 'gas_supply_basin_mapping.csv')
-    )
-    subset_cols = ['Plant Code', 'NG_LCI_Name']
-    ng_basin_mapping = ng_basin_mapping[subset_cols]
+    # get plant data and map each plant to its ng source: basin or region
+    # the 2016 ng emissions inventory is only available by basin
+    #   as such, plants can only be connected to upstream emissions via basin assignment
+    # newer data (2020) is available by region 
+    #   plants are connected to upstream ng emissions via region assignment
 
-    # Merge with ng_generation dataframe.
-    ng_generation_data_basin = pd.merge(
-        left = ng_generation_data,
-        right = ng_basin_mapping,
-        left_on = 'Plant Id',
-        right_on = 'Plant Code'
-    )
-    ng_generation_data_basin = ng_generation_data_basin.drop(
-        columns=['Plant Code']
-    )
+    if model_specs.ng_model_year == 2016:
+        ng_generation_data_mapped = map_ng_by_basin(year) # 'year' refers to eia_generation_year
+    else:
+        ng_generation_data_mapped = map_ng_by_region(year) # 'year' refers to eia_generation_year
 
     # Read the NG LCI file
-    # if year = 2016 - this step will directly ready NG_LCI.csv from the data_dir
-    # if year = 2020 - this step will require edx api, download ng model and mapping 
+    # if year = 2016 - this step will directly ready NG_LCI.csv from the data_dir - returns lci (by basin)
+    # if year = 2020 - this step will require edx api, download ng model and mapping - returns lci (by region)
     # document from edx, and generate lci
     ng_lci = get_ng_lci(model_specs.ng_model_year)
 
-    ng_lci_columns=[
-        "Compartment",
-        "FlowName",
-        "FlowUUID",
-        "Unit",
-        "FlowType",
-        "input",
-        "Basin",
-        "FlowAmount"
-    ]
-    ng_lci_stack = pd.DataFrame(ng_lci.stack()).reset_index()
-    ng_lci_stack.columns=ng_lci_columns
-
-    # Merge basin data with LCI dataset
-    ng_lci_basin = pd.merge(
-        ng_lci_stack,
-        ng_generation_data_basin,
-        left_on = 'Basin',
-        right_on = 'NG_LCI_Name',
-        how='left'
-    )
+    # merge ng lci and plants based on the common parameter: region or basin
+    if model_specs.ng_model_year == 2016:
+        ng_lci_mapped = map_ng_lci_to_plants_by_basin(ng_lci, ng_generation_data_mapped)
+    else:
+        ng_lci_mapped = map_ng_lci_to_plants_by_region(ng_lci, ng_generation_data_mapped)
 
     # Multiplying with the EIA 923 fuel consumption; conversion factor is
     # for MMBtu to MJ
     btu_to_MJ = pq.convert(10**6,'Btu','MJ')
-    ng_lci_basin["FlowAmount"]=(
-        ng_lci_basin["FlowAmount"]
-        * ng_lci_basin['Total Fuel Consumption MMBtu']
+    ng_lci_mapped["FlowAmount"]=(
+        ng_lci_mapped["FlowAmount"]
+        * ng_lci_mapped['Total Fuel Consumption MMBtu']
         * btu_to_MJ
     )
 
-    ng_lci_basin = ng_lci_basin.rename(
+    ng_lci_mapped = ng_lci_mapped.rename(
         columns={'Total Fuel Consumption MMBtu':'quantity'})
-    ng_lci_basin["quantity"]=ng_lci_basin["quantity"]*btu_to_MJ
+    ng_lci_mapped["quantity"]=ng_lci_mapped["quantity"]*btu_to_MJ
 
     # Output is kg emission for the specified year by facility Id,
     # not normalized to electricity output
 
-    ng_lci_basin['FuelCategory'] = 'GAS'
-    ng_lci_basin.rename(
+    ng_lci_mapped['FuelCategory'] = 'GAS'
+    ng_lci_mapped.rename(
         columns={
             'Plant Id':'plant_id',
+            'NG_LCI_Region': 'stage_code',
             'NG_LCI_Name':'stage_code',
             'Stage':'stage'},
         inplace=True
     )
-    ng_lci_basin["Year"] = year
-    ng_lci_basin["Source"] = "netlgaseiafuel"
-    ng_lci_basin["ElementaryFlowPrimeContext"] = "emission"
-    ng_lci_basin.loc[
-        ng_lci_basin["Compartment"].str.contains("resource/"),
+    ng_lci_mapped["Year"] = year
+    ng_lci_mapped["Source"] = "netlgaseiafuel"
+    ng_lci_mapped["ElementaryFlowPrimeContext"] = "emission"
+    ng_lci_mapped.loc[
+        ng_lci_mapped["Compartment"].str.contains("resource/"),
         "ElementaryFlowPrimeContext"] = "resource"
-    ng_lci_basin.loc[
-        ng_lci_basin["Compartment"].str.contains("Technosphere/"),
+    ng_lci_mapped.loc[
+        ng_lci_mapped["Compartment"].str.contains("Technosphere/"),
         "ElementaryFlowPrimeContext"] = "technosphere"
     # Issue #296 - adding DQI information for upstream processes
-    ng_lci_basin["Year"] = 2016
-    ng_lci_basin["DataReliability"] = 3
-    ng_lci_basin["TemporalCorrelation"] = add_temporal_correlation_score(
-        ng_lci_basin["Year"], model_specs.electricity_lci_target_year
+    ng_lci_mapped["Year"] = 2016
+    ng_lci_mapped["DataReliability"] = 3
+    ng_lci_mapped["TemporalCorrelation"] = add_temporal_correlation_score(
+        ng_lci_mapped["Year"], model_specs.electricity_lci_target_year
     )
-    ng_lci_basin["GeographicalCorrelation"] = 1
-    ng_lci_basin["TechnologicalCorrelation"] = 1
-    ng_lci_basin["DataCollection"] = 1
+    ng_lci_mapped["GeographicalCorrelation"] = 1
+    ng_lci_mapped["TechnologicalCorrelation"] = 1
+    ng_lci_mapped["DataCollection"] = 1
     #3/20/2025 MBJ - replacing renewable vintage here so that temporal correlation
     #is based on the year the inventory is based on, but when electricity
     #generation is combined, it needs to be based on the target year for the
     #inventory.
-    ng_lci_basin["Year"]=year
-
-    # Issue: the current basin-to-plant mapping document does not include the Alaska Offshore and GoM Offshore basins
-    #        on the other hand, the ng_lci generated above includes emissions for both of there basins
-    #        this causes NaN values in the 'ng_lci_basin' dataframe and then returns errors when converting to int32
-    #        a quick fix involves omitting NaN values from the 'ng_lci_basin' dataframe - but this assumes that Offshore 
-    #        gas production is not used in electricity production
-    #        A fix for the future involves updating the mapping document: 'gas_supply_basin_mapping.csv' to account for 
-    #        offshore gas used in electricity production
-    
-    ng_lci_basin = ng_lci_basin.dropna(subset=['FlowAmount'])
+    ng_lci_mapped["Year"]=year
     
-    return ng_lci_basin
+    return ng_lci_mapped
 
 ##############################################################################
 # HELPER FUNCTIONS
 ##############################################################################
 
+def map_ng_lci_to_plants_by_basin (ng_lci, ng_generation_data_mapped):
+    """
+    Attach plant-level fuel data to the basin-based natural gas inventory.
+
+    The wide inventory (one column per basin) is stacked to long form and
+    left-merged with the plant table on 'Basin' == 'NG_LCI_Name'; basins
+    without a mapped plant keep their rows with NaN plant columns.
+    """
+    # Long form: the index levels, then the basin label and its flow amount.
+    stacked = pd.DataFrame(ng_lci.stack()).reset_index()
+    stacked.columns = [
+        "Compartment",
+        "FlowName",
+        "FlowUUID",
+        "Unit",
+        "FlowType",
+        "input",
+        "Basin",
+        "FlowAmount",
+    ]
+    return stacked.merge(
+        ng_generation_data_mapped,
+        how='left',
+        left_on='Basin',
+        right_on='NG_LCI_Name',
+    )
+
+def map_ng_lci_to_plants_by_region (ng_lci, ng_generation_data_mapped):
+    """
+    Attach plant-level fuel data to the region-based natural gas inventory.
+
+    The wide inventory (one column per region) is stacked to long form and
+    left-merged with the plant table on 'Region' == 'NG_LCI_Region'; regions
+    without a mapped plant keep their rows with NaN plant columns.
+    """
+    # Long form: the index levels, then the region label and its flow amount.
+    stacked = pd.DataFrame(ng_lci.stack()).reset_index()
+    stacked.columns = [
+        "Compartment",
+        "FlowName",
+        "FlowUUID",
+        "Unit",
+        "FlowType",
+        "input",
+        "Region",
+        "FlowAmount",
+    ]
+    return stacked.merge(
+        ng_generation_data_mapped,
+        how='left',
+        left_on='Region',
+        right_on='NG_LCI_Region',
+    )
+
+
+def map_ng_by_region (year):
+    """
+    Assign each natural gas plant in the EIA 923 data to a delivery region.
+
+    Parameters
+    ----------
+    year : int, str
+        The year of the EIA 923 plant data to use; strings are cast to int.
+
+    Returns
+    -------
+    pandas.DataFrame
+        One row per 'Plant Id' with the summed 'Total Fuel Consumption
+        MMBtu', the plant's 'State', and 'NG_LCI_Region'.
+
+    Notes
+    -----
+    * Only rows reported with fuel type 'NG' and a positive fuel
+      consumption are kept.
+    * Consumption is summed per plant; 'State' is the plant's first
+      (non-null) value.
+    * Regions come from ``region_state_mapping``; a state missing from
+      that dictionary yields NaN in 'NG_LCI_Region'.
+    """
+    year = int(year) if isinstance(year, str) else year
+    eia_923 = eia923_download_extract(year)
+
+    # Natural gas rows that actually burned fuel.
+    is_ng = eia_923['Reported Fuel Type Code'] == 'NG'
+    burns_fuel = eia_923['Total Fuel Consumption MMBtu'] > 0
+    ng_rows = eia_923[is_ng & burns_fuel]
+
+    # Collapse multiple rows for the same plant into one.
+    per_plant = ng_rows.groupby('Plant Id').agg(
+        {'Total Fuel Consumption MMBtu': 'sum', 'State': 'first'}
+    ).reset_index()
+    per_plant['Plant Id'] = per_plant['Plant Id'].astype(int)
+
+    by_region = per_plant.copy()
+    by_region['NG_LCI_Region'] = per_plant['State'].map(region_state_mapping)
+    return by_region
+
+
+def map_ng_by_basin (year):
+    """
+    Assign each natural gas plant in the EIA 923 data to a supply basin.
+
+    Parameters
+    ----------
+    year : int, str
+        The year of the EIA 923 plant data to use; strings are cast to int.
+
+    Returns
+    -------
+    pandas.DataFrame
+        The per-plant summed 'Total Fuel Consumption MMBtu' joined with the
+        basin label 'NG_LCI_Name' ('Plant Code' is dropped after the join).
+
+    Notes
+    -----
+    * Only rows reported with fuel type 'NG' and a positive fuel
+      consumption are kept.
+    * Consumption is summed per plant because one facility can report
+      several rows for the same fuel (e.g., gas turbine and combined
+      cycle prime movers).
+    * Basins are read from 'gas_supply_basin_mapping.csv' in ``data_dir``.
+    * The merge uses pandas' default inner join, so plants missing from
+      the mapping file do not appear in the result.
+    """
+    year = int(year) if isinstance(year, str) else year
+
+    # EIA 923 data for the requested year, as returned by
+    # eia923_download_extract.
+    eia_923 = eia923_download_extract(year)
+
+    # Natural gas rows that actually burned fuel.
+    is_ng = eia_923['Reported Fuel Type Code'] == 'NG'
+    burns_fuel = eia_923['Total Fuel Consumption MMBtu'] > 0
+    ng_rows = eia_923[is_ng & burns_fuel]
+
+    # Collapse multiple rows for the same plant into one.
+    per_plant = ng_rows.groupby('Plant Id').agg(
+        {'Total Fuel Consumption MMBtu': 'sum'}
+    ).reset_index()
+    per_plant['Plant Id'] = per_plant['Plant Id'].astype(int)
+
+    # Plant-to-basin lookup; only the two needed columns are retained.
+    # NOTE:
+    #   This is a 2 MB file that provides about 100 kB of info!
+    basin_lookup = pd.read_csv(
+        os.path.join(data_dir, 'gas_supply_basin_mapping.csv')
+    )
+    basin_lookup = basin_lookup[['Plant Code', 'NG_LCI_Name']]
+
+    # Join on 'Plant Id' == 'Plant Code'; the duplicate key column is
+    # removed before returning.
+    with_basin = per_plant.merge(
+        basin_lookup,
+        left_on='Plant Id',
+        right_on='Plant Code',
+    )
+    with_basin = with_basin.drop(columns=['Plant Code'])
+    return with_basin
+
 def get_ng_lci(year):
     """
     Get the natural gas life cycle inventory for a given year.
@@ -288,7 +368,11 @@ def get_ng_lci(year):
         )
     else:
         data_folder = os.path.join(paths.local_path, 'netl')
-        #check if the ng_lci_2020rev1.csv already exists - if it does then we can skip all the below
+        # create new directory for ng if non existing
+        if not os.path.exists(os.path.join(data_folder,"2020_ng")):
+            os.makedirs(os.path.join(data_folder,"2020_ng"))
+        data_folder = os.path.join(data_folder,"2020_ng")
+        # check if the ng_lci_2020rev1.csv already exists - if it does then we can skip all the below
         if os.path.exists(os.path.join(data_folder, "ng_lci_2020rev1.csv")):
             logging.info(f"NG LCI already exists in your data directory.")
             ng_lci = pd.read_csv(
@@ -297,25 +381,25 @@ def get_ng_lci(year):
             )
         else:
             # if it does not exist, then we need to generate it
-            logging.info(f"Retrieving the {year} natural gas life cycle inventory by basin.")
+            logging.info(f"Retrieving the {year} natural gas life cycle inventory by region.")
             # this step will require downloading files from edx      
             # retrieve ng model
             # check if model is data_dir
-            if os.path.exists(os.path.join(data_folder, "ng_model_2020Rev1.xlsx")):
-                logging.info(f"NG model already exists in your data directory.")
-                excel_file_path = os.path.join(data_folder, "ng_model_2020Rev1.xlsx")
+            if not os.path.exists(os.path.join(data_folder,"2020_ng_model")):
+                os.makedirs(os.path.join(data_folder,"2020_ng_model"))
+                model_folder = os.path.join(data_folder,"2020_ng_model")
             else:
-                # download model from edx
-                logging.info(f"Downloading natural gas model from EDx.")
-                edx_api = model_specs.edx_api_key
-                r_id_ng_2020rev1 = 'cb8c8cf2-47ce-4ff0-b285-be73ba9294b9' 
-                # resource id of 2020 Rev1 ng model on EDx
-                try:
-                    download_edx(resource_id = r_id_ng_2020rev1, api_key = edx_api, output_dir = data_folder)
-                    excel_file_path = os.path.join(data_folder, "Appendix_F_2020_Full_Inventory_Results_US_Avg_ProdThruTrans.xlsx")
-                except Exception as e:
-                    logging.error(f"Error downloading natural gas model from EDx. Error: {e}")
-                    sys.exit(1)
+                model_folder = os.path.join(data_folder,"2020_ng_model")
+                for ngmodel in r_ids_2020.keys():
+                    if os.path.exists(os.path.join(model_folder, ngmodel)):
+                        logging.info(f"{ngmodel} already exists in your data directory.")
+                    else:
+                        logging.info(f"Downloading {ngmodel} from EDx.")
+                        try:
+                            download_edx(resource_id = r_ids_2020[ngmodel], api_key = model_specs.edx_api_key, output_dir = model_folder)
+                        except Exception as e:
+                            logging.error(f"Error downloading {ngmodel} from EDx. Error: {e}")
+                            sys.exit(1)
             # retrieve flow mapping document from edx [elci.csv]
             # check if flowmapping csv exists in data_dir
             if os.path.exists(os.path.join(data_folder, "elci.csv")):
@@ -326,16 +410,15 @@ def get_ng_lci(year):
                 logging.info(f"Downloading ELCI flow mapping document from EDx.")
                 r_id_elci = 'e2c8f934-e95e-470a-879b-17ebe4afd39e' # resource id of elci flow mapping document on EDx
                 try:
-                    download_edx(resource_id = r_id_elci, api_key = edx_api, output_dir = data_folder)
+                    download_edx(resource_id = r_id_elci, api_key = model_specs.edx_api_key, output_dir = data_folder)
                     flow_mapping_path = os.path.join(data_folder, "elci.csv")  
                 except Exception as e:
                     logging.error(f"Error downloading ELCI flow mapping document from EDx. Error: {e}")
                     sys.exit(1)
-            # production sheet name
-            production_sheet_name = '2020 Production Shares'
+
             # run the generate_ng_lci function and save it in data_dir
             try:
-                generate_lci (technobasins_basins, excel_file_path, flow_mapping_path, production_sheet_name, destination_path = data_folder, final_table_name = "ng_lci_2020rev1")
+                generate_lci (excel_folder_path = model_folder, flow_mapping_path = flow_mapping_path, destination_path = data_folder, final_table_name = "ng_lci_2020rev1")
                 ng_lci = pd.read_csv(
                     os.path.join(data_folder, "ng_lci_2020rev1.csv"),
                     index_col=[0,1,2,3,4,5]
@@ -345,14 +428,13 @@ def get_ng_lci(year):
                 sys.exit(1)
     return ng_lci
 
-def generate_lci(technobasins_basins, excel_file_path, flow_mapping_path, production_sheet_name, destination_path, final_table_name):
+def generate_lci(excel_folder_path, flow_mapping_path, destination_path, final_table_name):
     """
     This function reads an excel file, extracts the data, and generates a LCI for NG with the same format as the currently used file.
 
     Args:
-        technobasins_basins (dict): A dictionary that maps technobasins to basins
-        excel_file_path (str): The path to the excel file
-        production_sheet_name (str): The name of the sheet that contains the production shares
+        excel_folder_path (str): The path to the folder containing the excel files (ng models/inventories)
+        flow_mapping_path (str): The path to the flow mapping file
         destination_path (str): !!This is an optional input!! 
                                 The path to the destination folder. If not provided, the function 
                                 will save the file in the current working directory.
@@ -363,51 +445,32 @@ def generate_lci(technobasins_basins, excel_file_path, flow_mapping_path, produc
         final_table (pd.DataFrame): A dataframe with the LCI for NG with the same format as the currently used file.
 
     Notes:
-        - The function is senstive to the naming convention of the technobasins in the excel file.
-        - The current naming convention is: 'FI - <basin> <type>'. 
-        - Specifically, the current script is set up for the following sheet names:
-            - 'FI - App Shale', 'FI - Alaska Offshore', 'FI - Anadarko Conv', 'FI - Anadarko Shale', 'FI - Anadarko Tight', 
-            'FI - Arkla Conv', 'FI - Arkla Shale', 'FI - Arkla Tight', 'FI - Arkoma Conv', 'FI - Arkoma Shale', 'FI - East Texas Conv', 
-            'FI - East Texas Shale', 'FI - East Texas Tight', 'FI - Fort Worth Shale', 'FI - Green River Conv', 'FI - Green River Tight', 
-            'FI - Gulf Conv', 'FI - Gulf Shale', 'FI - Gulf TIght', 'FI - Permian Conv', 'FI - Permian Shale', 'FI - Piceance Tight', 
-            'FI - San Juan CBM', 'FI - San Juan Shale', 'FI - South OK Shale', 'FI - Strawn Shale', 'FI - Uinta Conv', 'FI - Uinta Tight', 
-            'FI - GoM Offshore'
+        - The function is sensitive to the naming convention of the regions in the excel file.
     """
-    # 0. Develop dictionary for basin, technobasins, and production shares
-    technobasins_basins = final_dictionary (technobasins_basins, excel_file_path, production_sheet_name)
-    print(technobasins_basins)
-
     final_table = pd.DataFrame()
 
-    # 1. Read excel file
-    input_data = pd.ExcelFile(excel_file_path)
-    sheet_names = input_data.sheet_names
-    sheet_names = [name for name in sheet_names if name.startswith("FI")]
-    sheet_names = sheet_names[1:] # Drop the US Average sheet
-
-    # Get unused ground and water emissions based on average US emissions "FI - US Average"
-    unused_ground_emissions, unused_water_emissions = get_unused_flows(excel_file_path, "FI - US Average")
+    # determine folder path containing the excel files
+    
+    # 1. Read excel files in the folder path containing the model
+    for filename in os.listdir(excel_folder_path):
+        if filename.endswith('.xlsx'):
+            file_path = os.path.join(excel_folder_path, filename)
+            logging.info(f"Reading file: {file_path}")
+            input_data = pd.ExcelFile(file_path)
+            sheet_names = input_data.sheet_names
+            sheet_name = [name for name in sheet_names if name in region_sheets_dict.values()][0]
 
-    for sheet in sheet_names:
         # Extract air, water, and ground emissions data for the selected sheet (i.e., technobasin)
-        air_emissions_data, water_emissions_data, ground_emissions_data = read_technobasin_data(excel_file_path, sheet)
+        air_emissions_data, water_emissions_data, ground_emissions_data = read_region_data(file_path, sheet_name)
         
         # Air emissions Get the correct flow names, compartment, and uuid for each flow
         full_air_emissions_data = correct_netl_flow_names(air_emissions_data, flow_mapping_path)
         full_air_emissions_data = full_air_emissions_data[full_air_emissions_data['FlowUUID'].notna()] # drop rows with FlowUUID NaN
         
-        # Water emissions - drop unused flows
-        if unused_water_emissions is not None:
-            for flow in unused_water_emissions['FlowName']:
-                water_emissions_data = water_emissions_data.drop(water_emissions_data[water_emissions_data['FlowName'] == flow].index)        
         # Water emissions - get the correct flow names, compartment, and uuid for each flow
         full_water_emissions_data = correct_netl_flow_names(water_emissions_data, flow_mapping_path)
         full_water_emissions_data = full_water_emissions_data[full_water_emissions_data['FlowUUID'].notna()] # drop rows with FlowUUID NaN
         
-        # Ground emissions - drop unused flows
-        if unused_ground_emissions is not None:
-            for flow in unused_ground_emissions['FlowName']:
-                ground_emissions_data = ground_emissions_data.drop(ground_emissions_data[ground_emissions_data['FlowName'] == flow].index)
         # Ground emissions - get the correct flow names, compartment, and uuid for each flow
         full_ground_emissions_data = correct_netl_flow_names(ground_emissions_data, flow_mapping_path)
         full_ground_emissions_data = full_ground_emissions_data[full_ground_emissions_data['FlowUUID'].notna()] # drop rows with FlowUUID NaN
@@ -415,13 +478,9 @@ def generate_lci(technobasins_basins, excel_file_path, flow_mapping_path, produc
         # combine dataframes
         df1 = pd.concat([full_air_emissions_data, full_water_emissions_data, full_ground_emissions_data])
         df1 = df1.sort_values(by='FlowUUID') # sort by FlowUUID
-        basin_name = find_basin (technobasins_basins, sheet)
+        region = [key for key, v in region_sheets_dict.items() if v == sheet_name][0]
         df1['FlowAmount'] = df1['FlowAmount'].astype(float)
         df1['FlowAmount'] = df1['FlowAmount'].fillna(0)
-        norm_value = get_normalized_values(technobasins_basins, sheet)
-        df1['norm'] = norm_value
-        df1['norm'] = df1['norm'].astype(float)
-        df1['normalized_emissions'] = df1['FlowAmount'] * df1['norm']
 
         # create final_table structure in 1st iteration
         if final_table.empty:
@@ -432,66 +491,25 @@ def generate_lci(technobasins_basins, excel_file_path, flow_mapping_path, produc
             final_table = final_table[['Compartment', 'FlowName', 'FlowUUID', 'Unit', 'flow_type', 'input']]
             final_table.columns = ['compartment', 'flow_name', 'uuid', 'unit', 'flow_type', 'is_input']
             # add a column for each basin
-            basins_columns = list (technobasins_basins.keys())
-            for basin in basins_columns:
-                final_table[basin] = 0
-        final_table.head()
-        final_table.shape
+            region_columns = list (region_sheets_dict.keys())
+            for r in region_columns:
+                final_table[r] = 0
         
-        # Compute normalized emissions and add to final table   
+        # add region emissions to final table   
         try:
-            final_table['normalized_emissions'] = df1['normalized_emissions'].values
-            final_table[basin_name] += final_table['normalized_emissions']
-            final_table = final_table.drop(columns=['normalized_emissions'])
+            logging.info(f"Adding emissions for {region}")
+            logging.info(f"df1: {df1['FlowAmount'].head(5)}")
+            final_table[region] = df1['FlowAmount']
         except Exception as e:
             sys.exit(f"Error reading sheet. Make sure your excel file follows the correct naming convention.For reference, refer to the source code, lines 70-78. Error: {e}")
 
-
     # 2. Save final table to excel
     save_ng_lci(final_table, final_table_name ,destination_path)
-    print(f"Final table saved to {destination_path}/{final_table_name}.csv")
+    print(f"Final table saved to {destination_path}/{final_table_name}.xlsx")
     
     return final_table
 
-def get_unused_flows(excel_file_path, sheet_name):
-    """
-    This function extracts the unused ground and water emissions from a given natural gas results dataset
-
-    Inputs:
-    - excel_file_path: path to the excel file
-    - sheet_name: name of the sheet to extract the data from
-
-    Outputs:
-    - unused_ground_emissions: dataframe containing the unused ground emissions
-    - unused_water_emissions: dataframe containing the unused water emissions
-    """
-    us_average_data = pd.read_excel(excel_file_path, sheet_name=sheet_name,skiprows=0,header=None) 
-    us_average_data.iloc[0] = us_average_data.iloc[0].ffill()
-    us_average_data.iloc[1] = us_average_data.iloc[1].ffill()
-    us_average_data.columns = us_average_data.iloc[2]
-    us_average_data = us_average_data.drop(columns=["P2.5", "P97.5"])
-    us_average_data.columns = us_average_data.iloc[0]
-    us_average_data = us_average_data.drop(us_average_data.index[0])
-    #extract ground data from us_average sheet
-    ground_emissions_data = us_average_data.iloc[:, [us_average_data.shape[1]-3, us_average_data.shape[1]-2]]
-    ground_emissions_data.columns.values[0] = "FlowName"
-    ground_emissions_data.columns.values[1] = "FlowAmount"
-    ground_emissions_data = ground_emissions_data.dropna()
-    ground_emissions_data = ground_emissions_data.iloc[1:]
-    #extract water data from us_average sheet
-    water_emissions_data = us_average_data.iloc[:, [us_average_data.shape[1]-3, us_average_data.shape[1]-1]]
-    water_emissions_data.columns.values[0] = "FlowName"
-    water_emissions_data.columns.values[1] = "FlowAmount"
-    water_emissions_data = water_emissions_data.iloc[2:]
-    water_emissions_data = water_emissions_data.dropna()
-    #unused ground emissions
-    unused_ground_emissions = ground_emissions_data[ground_emissions_data['FlowAmount'] == 0.00e+00]
-    #unused water emissions
-    unused_water_emissions = water_emissions_data[water_emissions_data['FlowAmount'] == 0.00e+00]
-
-    return unused_ground_emissions, unused_water_emissions
-
-def read_technobasin_data(excel_file_path, sheet_name):
+def read_region_data(excel_file_path, sheet_name):
     """
     This function reads an excel file, extracts the data, and generates a df for NG emissions for air, water, and ground.
     The df includes the flow name and flow amount (P2.5 and P97.5 values are dropped).
@@ -549,47 +567,6 @@ def read_technobasin_data(excel_file_path, sheet_name):
 
     return air_emissions_data, water_emissions_data, ground_emissions_data
 
-
-# Helper function to calculate normalized values for each technobasin
-def get_normalized_values(technobasins_basins, technobasin):
-    for outer, inner in technobasins_basins.items():
-        if technobasin in inner:
-            total = sum(inner.values())
-            return float(inner[technobasin] / total)
-    return None
-
-# helper function to find basins for a given technobasin
-def find_basin(technobasins_basins, technobasin_name):
-    for outer, inner in technobasins_basins.items():
-        if technobasin_name in inner:
-            return outer
-    return None
-
-# Helper function to use aliases to normalize technobasin naming
-def _normalize_technobasin_naming(name):
-    name_lower = name.lower().strip()
-    
-    # Check exact or partial match
-    for alias, canonical in aliases.items():
-        alias_clean = alias.lower()
-        if name_lower in alias_clean or alias_clean in name_lower:
-            return canonical
-
-# Helper function to create the final dictionary including basin, technobasin, and production share
-def final_dictionary(technobasins_basins, excel_file_path, production_sheet_name):
-    production_shares_2020 = pd.read_excel(excel_file_path, sheet_name=production_sheet_name)
-    production_shares_2020 = production_shares_2020.iloc[1:]
-    production_shares_2020['Scenario Normalized'] = production_shares_2020['Scenario'].apply(lambda x: _normalize_technobasin_naming(x))
-    production_shares_2020 = production_shares_2020.drop(columns=production_shares_2020.columns[0])
-    production_shares_2020.columns.values[1] = 'Scenario'
-    production_shares_2020 = production_shares_2020[['Scenario', 'Production Shares (%)']]
-    # final dictionary including basin, technobasin, and production share
-    technobasins_basins = {
-        key: {num: production_shares_2020.set_index('Scenario').loc[num, 'Production Shares (%)'] for num in nums}
-        for key, nums in technobasins_basins.items()
-    }
-    return technobasins_basins
-
 def save_ng_lci(df, filename, destination_path):
     """
     This function saves the final table to an excel file.

From 2726c60d16c2b802c76fe37f8c3815028cac9742 Mon Sep 17 00:00:00 2001
From: Francis Hanna <hannafra@msu.edu>
Date: Mon, 20 Oct 2025 14:14:40 -0400
Subject: [PATCH 3/6] Minor edits to natural_gas_upstream.py

---
 electricitylci/natural_gas_upstream.py | 37 ++++++++++++--------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/electricitylci/natural_gas_upstream.py b/electricitylci/natural_gas_upstream.py
index 92dd9a95..3c825ded 100644
--- a/electricitylci/natural_gas_upstream.py
+++ b/electricitylci/natural_gas_upstream.py
@@ -19,6 +19,7 @@
 from electricitylci.model_config import model_specs
 from electricitylci.utils import download_edx
 from electricitylci.globals import paths
+from electricitylci.utils import check_output_dir
 ##############################################################################
 # MODULE DOCUMENTATION
 ##############################################################################
@@ -61,8 +62,8 @@
     'MO':'Midwest','NE':'Midwest','SD':'Midwest','IL':'Midwest','IN':'Midwest','OH':'Midwest','WI':'Midwest','MI':'Midwest',
     'AR':'Southeast','LA':'Southeast','AL':'Southeast','FL':'Southeast','GA':'Southeast','MS':'Southeast','SC':'Southeast','KY':'Southeast',
     'NC':'Southeast','TN':'Southeast','VA':'Southeast','WV':'Southeast','DE':'Southeast','MD':'Southeast','CT':'Northeast','MA':'Northeast',
-    'NH':'Northeast','RI':'Northeast','VT':'Northeast','NJ':'Northeast','NY':'Northeast','PA':'Northeast','ME':'Northeast',
-} #TOTAL 48 -- EXCLUDING AL, HI, AND DC
+    'NH':'Northeast','RI':'Northeast','VT':'Northeast','NJ':'Northeast','NY':'Northeast','PA':'Northeast','ME':'Northeast', 'DC':'Northeast',
+} #TOTAL 49 (48 contiguous states + DC) -- EXCLUDING AK and HI
 
 ##############################################################################
 # MAN FUNCTION
@@ -153,7 +154,7 @@ def generate_upstream_ng(year):
         ng_lci_mapped["Compartment"].str.contains("Technosphere/"),
         "ElementaryFlowPrimeContext"] = "technosphere"
     # Issue #296 - adding DQI information for upstream processes
-    ng_lci_mapped["Year"] = 2016
+    ng_lci_mapped["Year"] = model_specs.ng_model_year
     ng_lci_mapped["DataReliability"] = 3
     ng_lci_mapped["TemporalCorrelation"] = add_temporal_correlation_score(
         ng_lci_mapped["Year"], model_specs.electricity_lci_target_year
@@ -369,8 +370,7 @@ def get_ng_lci(year):
     else:
         data_folder = os.path.join(paths.local_path, 'netl')
         # create new directory for ng if non existing
-        if not os.path.exists(os.path.join(data_folder,"2020_ng")):
-            os.makedirs(os.path.join(data_folder,"2020_ng"))
+        check_output_dir(os.path.join(data_folder,"2020_ng"))
         data_folder = os.path.join(data_folder,"2020_ng")
         # check if the ng_lci_2020rev1.csv already exists - if it does then we can skip all the below
         if os.path.exists(os.path.join(data_folder, "ng_lci_2020rev1.csv")):
@@ -385,21 +385,18 @@ def get_ng_lci(year):
             # this step will require downloading files from edx      
             # retrieve ng model
             # check if model is data_dir
-            if not os.path.exists(os.path.join(data_folder,"2020_ng_model")):
-                os.makedirs(os.path.join(data_folder,"2020_ng_model"))
-                model_folder = os.path.join(data_folder,"2020_ng_model")
-            else:
-                model_folder = os.path.join(data_folder,"2020_ng_model")
-                for ngmodel in r_ids_2020.keys():
-                    if os.path.exists(os.path.join(model_folder, ngmodel)):
-                        logging.info(f"{ngmodel} already exists in your data directory.")
-                    else:
-                        logging.info(f"Downloading {ngmodel} from EDx.")
-                        try:
-                            download_edx(resource_id = r_ids_2020[ngmodel], api_key = model_specs.edx_api_key, output_dir = model_folder)
-                        except Exception as e:
-                            logging.error(f"Error downloading {ngmodel} from EDx. Error: {e}")
-                            sys.exit(1)
+            check_output_dir(os.path.join(data_folder,"2020_ng_model"))
+            model_folder = os.path.join(data_folder,"2020_ng_model")
+            for ngmodel in r_ids_2020.keys():
+                if os.path.exists(os.path.join(model_folder, ngmodel)):
+                    logging.info(f"{ngmodel} already exists in your data directory.")
+                else:
+                    logging.info(f"Downloading {ngmodel} from EDx.")
+                    try:
+                        download_edx(resource_id = r_ids_2020[ngmodel], api_key = model_specs.edx_api_key, output_dir = model_folder)
+                    except Exception as e:
+                        logging.error(f"Error downloading {ngmodel} from EDx. Error: {e}")
+                        sys.exit(1)
             # retrieve flow mapping document from edx [elci.csv]
             # check if flowmapping csv exists in data_dir
             if os.path.exists(os.path.join(data_folder, "elci.csv")):

From e739bde68d09b98a1d5224e86ffc418007de617e Mon Sep 17 00:00:00 2001
From: Francis Hanna <hannafra@msu.edu>
Date: Fri, 7 Nov 2025 11:33:02 -0500
Subject: [PATCH 4/6] updated yaml documentation

---
 electricitylci/data/process_metadata.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/electricitylci/data/process_metadata.yml b/electricitylci/data/process_metadata.yml
index 270d265a..27b3bf37 100644
--- a/electricitylci/data/process_metadata.yml
+++ b/electricitylci/data/process_metadata.yml
@@ -860,12 +860,12 @@ gas_upstream:
     - 'The cradle-to-gate inventory for production of gas aggregated to basin. '
 
   techno_process: &gas_upstream_techno_process
-    - 'The NETL natural gas life cycle model includes parameters to generate inventories for natural gas extraction based on basin and geology which determines the gas extraction type (e.g., Appalachian Shale using hydraulic fracturing).
-      2016 natural gas production then informs the amount of each type of technology/region that form the mix in the regions.
+    - 'The NETL natural gas life cycle model includes parameters to generate inventories for natural gas extraction based on region or basin and geology which determines the gas extraction type (e.g., Appalachian Shale using hydraulic fracturing).
+      2016 or 2020 natural gas production then informs the amount of each type of technology/region that form the mix in the regions, depending on the year selected in the model configuration.
       These can be further aggregated to a US average.
       More details are in the natural gas upstream report at the link below
-
-      https://www.netl.doe.gov/energy-analysis/details?id=3198'
+      2016: https://www.netl.doe.gov/energy-analysis/details?id=4f43cb3f-c0d7-482e-bf01-39995a7c7497
+      2020: https://www.netl.doe.gov/energy-analysis/details?id=546d4009-c43b-43f5-bcc9-64d5e63fc8d5
 
   Description:
   - *gas_upstream_techno_intro

From 93ac54971357059c92c8b9c5e0735ea9b53bcc35 Mon Sep 17 00:00:00 2001
From: Francis Hanna <hannafra@msu.edu>
Date: Fri, 7 Nov 2025 11:49:02 -0500
Subject: [PATCH 5/6] yaml documentation quickfix

---
 electricitylci/data/process_metadata.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/electricitylci/data/process_metadata.yml b/electricitylci/data/process_metadata.yml
index 27b3bf37..f409add7 100644
--- a/electricitylci/data/process_metadata.yml
+++ b/electricitylci/data/process_metadata.yml
@@ -866,6 +866,7 @@ gas_upstream:
       More details are in the natural gas upstream report at the link below
       2016: https://www.netl.doe.gov/energy-analysis/details?id=4f43cb3f-c0d7-482e-bf01-39995a7c7497
       2020: https://www.netl.doe.gov/energy-analysis/details?id=546d4009-c43b-43f5-bcc9-64d5e63fc8d5
+      '
 
   Description:
   - *gas_upstream_techno_intro

From e7f5997f37cf91caa5a4f591bf421d1a864f8413 Mon Sep 17 00:00:00 2001
From: Francis Hanna <hannafra@msu.edu>
Date: Fri, 7 Nov 2025 14:40:05 -0500
Subject: [PATCH 6/6] process metadata - Yaml file minor edit

---
 electricitylci/data/process_metadata.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/electricitylci/data/process_metadata.yml b/electricitylci/data/process_metadata.yml
index f409add7..9cc8f040 100644
--- a/electricitylci/data/process_metadata.yml
+++ b/electricitylci/data/process_metadata.yml
@@ -857,15 +857,15 @@ coal_upstream:
 gas_upstream:
 
   techno_intro: &gas_upstream_techno_intro
-    - 'The cradle-to-gate inventory for production of gas aggregated to basin. '
+    - 'The cradle-to-gate inventory for production of gas aggregated to basin or region, depending on the year selected in the model configuration. '
 
   techno_process: &gas_upstream_techno_process
     - 'The NETL natural gas life cycle model includes parameters to generate inventories for natural gas extraction based on region or basin and geology which determines the gas extraction type (e.g., Appalachian Shale using hydraulic fracturing).
       2016 or 2020 natural gas production then informs the amount of each type of technology/region that form the mix in the regions, depending on the year selected in the model configuration.
       These can be further aggregated to a US average.
-      More details are in the natural gas upstream report at the link below
-      2016: https://www.netl.doe.gov/energy-analysis/details?id=4f43cb3f-c0d7-482e-bf01-39995a7c7497
-      2020: https://www.netl.doe.gov/energy-analysis/details?id=546d4009-c43b-43f5-bcc9-64d5e63fc8d5
+      More details are in the natural gas upstream report at the following links.
+      Link for 2016: https://www.netl.doe.gov/energy-analysis/details?id=4f43cb3f-c0d7-482e-bf01-39995a7c7497
+      Link for 2020: https://www.netl.doe.gov/energy-analysis/details?id=546d4009-c43b-43f5-bcc9-64d5e63fc8d5
       '
 
   Description: