From bd1ac69b6005a9491056feee5bdecd47f602f0ea Mon Sep 17 00:00:00 2001 From: willjayeo Date: Tue, 18 Nov 2025 18:52:40 +0000 Subject: [PATCH 1/5] Parse MSOA data --- census/nomis_census.py | 2 +- scripts/plot_cornish_identity.py | 72 ++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100755 scripts/plot_cornish_identity.py diff --git a/census/nomis_census.py b/census/nomis_census.py index 7dcac40..5eb572c 100644 --- a/census/nomis_census.py +++ b/census/nomis_census.py @@ -132,7 +132,7 @@ def map_data_to_polygons( # Remove output areas that are not required if output_areas_to_keep is not None: - self.remove_polygons_from_gpkg(polygons_to_keep) + self.remove_polygons_from_gpkg(output_areas_to_keep) # Join the census data to the output area polygons gdf = gdf.merge( diff --git a/scripts/plot_cornish_identity.py b/scripts/plot_cornish_identity.py new file mode 100755 index 0000000..d1d1f4f --- /dev/null +++ b/scripts/plot_cornish_identity.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +""" + +Author: William Jay, November 2025 +""" + +import argparse + +from census.nomis_census import Census + +AUTHORITY_OF_INTEREST = "Cornwall" +AUTHORITY_FIELD = "MSOA21NM" +CSV_GEOMETRY = "Area" +GPKG_GEOMETRY = "MSOA21CD" + + +def main( + csv_path: str, + gpkg_path: str, + output_plot: str, + #variable: str, + #percent_of_variable: str = None, + output_areas_to_keep: list[str] = AUTHORITY_OF_INTEREST, + csv_geometry_field: str = CSV_GEOMETRY, + gpkg_geometry_field: str = GPKG_GEOMETRY, +): + """ + Middle Super Output Area polygons accessed from + https://www.data.gov.uk/dataset/677a5164-3a9e-4752-b8e6-5744d2b280ec/middle-layer-super-output-areas-december-2021-boundaries-ew-bgc-v3 + """ + + # Get Census object + census = Census(csv_path) + + # Map data onto output area polygons + census.map_data_to_polygons( + gpkg_path, csv_geometry_field, gpkg_geometry_field, output_areas_to_keep + ) + + # Calculate percent of variable + #if percent_of_variable is not None: + # + #` variable = census.calc_percent_of_variable(variable, percent_of_variable) + +if __name__ == "__main__": + helpstring = "" + parser = argparse.ArgumentParser( + description=helpstring, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--input_csv", + type=str, + required=True, + help="Unix-style glob to CSV files downloaded from NOMIS", + ) + parser.add_argument( + "--input_gpkg", + type=str, + required=True, + help="Path to GeoPackage file containing census Output Area features", + ) + parser.add_argument( + "--output_plot", + type=str, + required=True, + help="Path to write output plot to", + ) + + cmdline = parser.parse_args() + + main(cmdline.input_csv, cmdline.input_gpkg, cmdline.output_plot) From 9df2544438973aff34f48362e9393fd35f774340 Mon Sep 17 00:00:00 2001 From: willjayeo Date: Tue, 18 Nov 2025 22:01:44 +0000 Subject: [PATCH 2/5] Remove unwanted local authorities. Need to fix join --- census/nomis_census.py | 12 ++++++- scripts/plot_cornish_identity.py | 56 +++++++++++++++++++++++++++----- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/census/nomis_census.py b/census/nomis_census.py index 5eb572c..784b997 100644 --- a/census/nomis_census.py +++ b/census/nomis_census.py @@ -133,7 +133,17 @@ def map_data_to_polygons( # Remove output areas that are not required if output_areas_to_keep is not None: self.remove_polygons_from_gpkg(output_areas_to_keep) - + + # Check that the datatypes of the join columns match + nomis_dtype = self.data[geometry_field_nomis].dtype + gpkg_dtype = gdf[geometry_field_gpkg].dtype + if nomis_dtype != gpkg_dtype: + raise ValueError( + "Datatype mismatch for join columns. NOMIS field " + f"'{geometry_field_nomis}' is '{nomis_dtype}' and GeoPackage field " + f"'{geometry_field_gpkg}' is '{gpkg_dtype}'" + ) + # Join the census data to the output area polygons gdf = gdf.merge( self.data, diff --git a/scripts/plot_cornish_identity.py b/scripts/plot_cornish_identity.py index d1d1f4f..f90395a 100755 --- a/scripts/plot_cornish_identity.py +++ b/scripts/plot_cornish_identity.py @@ -18,9 +18,8 @@ def main( csv_path: str, gpkg_path: str, output_plot: str, - #variable: str, - #percent_of_variable: str = None, - output_areas_to_keep: list[str] = AUTHORITY_OF_INTEREST, + authorities_to_keep: list[str] = AUTHORITY_OF_INTEREST, + authority_field: str = AUTHORITY_FIELD, csv_geometry_field: str = CSV_GEOMETRY, gpkg_geometry_field: str = GPKG_GEOMETRY, ): @@ -34,13 +33,54 @@ def main( # Map data onto output area polygons census.map_data_to_polygons( - gpkg_path, csv_geometry_field, gpkg_geometry_field, output_areas_to_keep + gpkg_path, csv_geometry_field, gpkg_geometry_field, authorities_to_keep ) - # Calculate percent of variable - #if percent_of_variable is not None: - # - #` variable = census.calc_percent_of_variable(variable, percent_of_variable) + # This dataset has MSOAs named with the prefix of 'msoa2021:' which needs removing + # in order to match the MSOA names in the GeoPackage. The following line removes + # characters to the left of and including ':' + census.data[csv_geometry_field] = ( + census.data[csv_geometry_field].str.split(":").str.get(1) + ) + + print(census.data["Area"]) + + # Remove unrequired authorities + census.mapped_data = remove_polygons_by_authority( + census.mapped_data, authorities_to_keep, authority_field + ) + + print(census.mapped_data["UK identity: Cornish only identity"].describe()) + + fields_of_interest = [ + "UK identity: British only identity", + "UK identity: English only identity", + "UK identity: English and British only identity", + "UK identity: Cornish only identity", + "UK identity: Cornish and British only identity", + ] + total_field = "Total: All usual residents" + # List to populate new field names with + percent_fields = [] + + for field in fields_of_interest: + + # Calculate field as a percentage of the total per output area + new_field_name = census.calc_percent_of_variable(field, total_field) + # Add new field name to list + percent_fields.append(new_field_name) + + +# TODO: Make this flexible and handle queries for multiple regions and add to Census class +def remove_polygons_by_authority( + gdf, authorities_to_keep: list[str], authority_field: str +): + """ + Remove polygons from self.output_areas GeoDataFrame by inputting a list of aurthority names to keep. + """ + + return gdf[gdf["MSOA21NM"].str.startswith("Cornwall")] + if __name__ == "__main__": helpstring = "" From 1fee881648d832c433dd8ab371c2ce8e2e9116f8 Mon Sep 17 00:00:00 2001 From: willjayeo Date: Wed, 19 Nov 2025 21:28:28 +0000 Subject: [PATCH 3/5] Bug fix, merge now works --- scripts/plot_cornish_identity.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/scripts/plot_cornish_identity.py b/scripts/plot_cornish_identity.py index f90395a..4f8653e 100755 --- a/scripts/plot_cornish_identity.py +++ b/scripts/plot_cornish_identity.py @@ -11,7 +11,7 @@ AUTHORITY_OF_INTEREST = "Cornwall" AUTHORITY_FIELD = "MSOA21NM" CSV_GEOMETRY = "Area" -GPKG_GEOMETRY = "MSOA21CD" +GPKG_GEOMETRY = "MSOA21NM" def main( @@ -31,11 +31,6 @@ def main( # Get Census object census = Census(csv_path) - # Map data onto output area polygons - census.map_data_to_polygons( - gpkg_path, csv_geometry_field, gpkg_geometry_field, authorities_to_keep - ) - # This dataset has MSOAs named with the prefix of 'msoa2021:' which needs removing # in order to match the MSOA names in the GeoPackage. The following line removes # characters to the left of and including ':' @@ -43,15 +38,16 @@ def main( census.data[csv_geometry_field].str.split(":").str.get(1) ) - print(census.data["Area"]) + # Map data onto output area polygons + census.map_data_to_polygons( + gpkg_path, csv_geometry_field, gpkg_geometry_field, authorities_to_keep + ) # Remove unrequired authorities census.mapped_data = remove_polygons_by_authority( census.mapped_data, authorities_to_keep, authority_field ) - print(census.mapped_data["UK identity: Cornish only identity"].describe()) - fields_of_interest = [ "UK identity: British only identity", "UK identity: English only identity", From 010608c258a2706d1e0ac1aa758ccad9843307ca Mon Sep 17 00:00:00 2001 From: willjayeo Date: Wed, 19 Nov 2025 22:24:24 +0000 Subject: [PATCH 4/5] Make choropleth of MSOA --- census/nomis_census.py | 44 ++++++++++++++++++++++---------- scripts/make_leaflet_map.py | 38 +++------------------------ scripts/plot_cornish_identity.py | 29 +++++++++------------ 3 files changed, 46 insertions(+), 65 deletions(-) diff --git a/census/nomis_census.py b/census/nomis_census.py index 784b997..888bee6 100644 --- a/census/nomis_census.py +++ b/census/nomis_census.py @@ -4,6 +4,8 @@ Author: William Jay, November 2025 """ +import folium + import geopandas as gpd import pandas as pd @@ -114,40 +116,34 @@ def map_data_to_polygons( gpkg_path: str, geometry_field_nomis: str = "2021 output area", geometry_field_gpkg: str = "OA21CD", - output_areas_to_keep: list[str] = None, ): """ Returns a geopandas DataFrame that contains geospatial polygons for each geographic entry such as output areas or super output areas. A GeoPackage file must be input containing these output areas at the resolution that matches the census data. - - If output_areas_to_keep is None (the default value), then all output areas are - kept. Otherwise all others are removed. Use this if you are only interested in a - specific region. """ # Open output areas as GeoDataFrame gdf = gpd.read_file(gpkg_path) - # Remove output areas that are not required - if output_areas_to_keep is not None: - self.remove_polygons_from_gpkg(output_areas_to_keep) + # Add geometry fields ass attributes + self.geometry_field_gpkg = geometry_field_gpkg # Check that the datatypes of the join columns match nomis_dtype = self.data[geometry_field_nomis].dtype - gpkg_dtype = gdf[geometry_field_gpkg].dtype + gpkg_dtype = gdf[self.geometry_field_gpkg].dtype if nomis_dtype != gpkg_dtype: raise ValueError( "Datatype mismatch for join columns. NOMIS field " f"'{geometry_field_nomis}' is '{nomis_dtype}' and GeoPackage field " - f"'{geometry_field_gpkg}' is '{gpkg_dtype}'" + f"'{self.geometry_field_gpkg}' is '{gpkg_dtype}'" ) # Join the census data to the output area polygons gdf = gdf.merge( self.data, - left_on=geometry_field_gpkg, + left_on=self.geometry_field_gpkg, right_on=geometry_field_nomis, how="left", ) @@ -158,9 +154,29 @@ def map_data_to_polygons( # Assign to attribute self.mapped_data = mapped_gdf - def remove_polygons_from_gpkg(self, polygons_to_keep: list[str]): + def create_choropleth_map( + self, + value_field: str, + output_map: str, + start_coords: list[float], + zoom_level: int, + ): """ - Remove polygons from self.output_areas GeoDataFrame + Create a Leaflet choropleth map using the folium.Choropleth method """ - pass + # Create map centered over Plymouth + folium_map = folium.Map(start_coords, zoom_start=zoom_level) + + # Create choropleth features + choropleth = folium.Choropleth( + geo_data=self.mapped_data, + data=self.mapped_data, + columns=[self.geometry_field_gpkg, value_field], + key_on=f"feature.properties.{self.geometry_field_gpkg}", + legend_name=value_field, + highlight=True, + ).add_to(folium_map) + + # Write as HTML + folium_map.save(output_map) diff --git a/scripts/make_leaflet_map.py b/scripts/make_leaflet_map.py index 8c0b55d..7a38b3b 100755 --- a/scripts/make_leaflet_map.py +++ b/scripts/make_leaflet_map.py @@ -6,7 +6,6 @@ """ import argparse -import folium import geopandas as gpd @@ -26,6 +25,8 @@ def main( percent_of_variable: str = None, csv_geometry_field: str = CSV_GEOMETRY, gpkg_geometry_field: str = GPKG_GEOMETRY, + start_coords: list[float] = PLYMOUTH_COORDS, + start_coords: int = ZOOM_LEVEL, ): """ Create a Leaflet HTML choropleth map using Folium for a chosen variable in a @@ -47,47 +48,16 @@ def main( # Calculate percent of variable if percent_of_variable is not None: - - variable = census.calc_percent_of_variable(variable, percent_of_variable) + + variable = census.calc_percent_of_variable(variable, percent_of_variable) # Create choropleth Leaflet map create_choropleth_map( - census.mapped_data, variable, - gpkg_geometry_field, output_map, ) -def create_choropleth_map( - gdf: gpd.GeoDataFrame, - value_field: str, - geometry_field: str, - output_map: str, - start_coords: list[float] = PLYMOUTH_COORDS, - zoom_level: int = ZOOM_LEVEL, -): - """ - Create a Leaflet choropleth map using the folium.Choropleth method - """ - - # Create map centered over Plymouth - folium_map = folium.Map(start_coords, zoom_start=zoom_level) - - # Create choropleth features - choropleth = folium.Choropleth( - geo_data=gdf, - data=gdf, - columns=[geometry_field, value_field], - key_on=f"feature.properties.{geometry_field}", - legend_name=value_field, - highlight=True, - ).add_to(folium_map) - - # Write as HTML - folium_map.save(output_map) - - if __name__ == "__main__": helpstring = "" parser = argparse.ArgumentParser( diff --git a/scripts/plot_cornish_identity.py b/scripts/plot_cornish_identity.py index 4f8653e..dd68921 100755 --- a/scripts/plot_cornish_identity.py +++ b/scripts/plot_cornish_identity.py @@ -39,14 +39,12 @@ def main( ) # Map data onto output area polygons - census.map_data_to_polygons( - gpkg_path, csv_geometry_field, gpkg_geometry_field, authorities_to_keep - ) + census.map_data_to_polygons(gpkg_path, csv_geometry_field, gpkg_geometry_field) - # Remove unrequired authorities - census.mapped_data = remove_polygons_by_authority( - census.mapped_data, authorities_to_keep, authority_field - ) + # Remove MSOAs that are not Cornwall + census.mapped_data = census.mapped_data[ + census.mapped_data["MSOA21NM"].str.startswith("Cornwall") + ] fields_of_interest = [ "UK identity: British only identity", @@ -66,16 +64,13 @@ def main( # Add new field name to list percent_fields.append(new_field_name) - -# TODO: Make this flexible and handle queries for multiple regions and add to Census class -def remove_polygons_by_authority( - gdf, authorities_to_keep: list[str], authority_field: str -): - """ - Remove polygons from self.output_areas GeoDataFrame by inputting a list of aurthority names to keep. - """ - - return gdf[gdf["MSOA21NM"].str.startswith("Cornwall")] + # Create simple choropleth leaflet map + census.create_choropleth_map( + "UK identity: Cornish only identity_percent", + output_plot, + [50.406, -4.848], + 9, + ) if __name__ == "__main__": From 2a6185f243336370f9298ad149d6a292f8f57836 Mon Sep 17 00:00:00 2001 From: willjayeo Date: Wed, 19 Nov 2025 22:29:24 +0000 Subject: [PATCH 5/5] Fixed leaflet script --- scripts/make_leaflet_map.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/make_leaflet_map.py b/scripts/make_leaflet_map.py index 7a38b3b..1e13274 100755 --- a/scripts/make_leaflet_map.py +++ b/scripts/make_leaflet_map.py @@ -26,7 +26,7 @@ def main( csv_geometry_field: str = CSV_GEOMETRY, gpkg_geometry_field: str = GPKG_GEOMETRY, start_coords: list[float] = PLYMOUTH_COORDS, - start_coords: int = ZOOM_LEVEL, + zoom_level: int = ZOOM_LEVEL, ): """ Create a Leaflet HTML choropleth map using Folium for a chosen variable in a @@ -52,10 +52,7 @@ def main( variable = census.calc_percent_of_variable(variable, percent_of_variable) # Create choropleth Leaflet map - create_choropleth_map( - variable, - output_map, - ) + census.create_choropleth_map(variable, output_map, start_coords, zoom_level) if __name__ == "__main__":