diff --git a/.github/workflows/metadata-catalog.yml b/.github/workflows/metadata-catalog.yml index 5594765..a5ccf50 100644 --- a/.github/workflows/metadata-catalog.yml +++ b/.github/workflows/metadata-catalog.yml @@ -17,7 +17,7 @@ concurrency: env: TABLE_NAME: layer_definitions - S3_BASE_PATH: s3://riverscapes-athena/riverscapes_metadata/layer_definitions + S3_BASE_PATH: s3://riverscapes-athena/riverscapes_metadata/layer_definitions_raw/0.8/ ATHENA_DATABASE: default ATHENA_RESULT_BUCKET: s3://riverscapes-athena-output/query-results/metadata # <-- ensure this exists AWS_REGION: us-west-2 diff --git a/DataExchangeScripts.code-workspace b/DataExchangeScripts.code-workspace index beee49d..986454e 100644 --- a/DataExchangeScripts.code-workspace +++ b/DataExchangeScripts.code-workspace @@ -42,5 +42,13 @@ "**/.venv/**": true, "**/__pycache__/**": true }, + "json.schemas": [ + { + "fileMatch": [ + "**/layer_definitions.json" + ], + "url": "https://xml.riverscapes.net/riverscapes_metadata/schema/layer_definitions.schema.json" + } + ] } } \ No newline at end of file diff --git a/pipelines/README-AthenaOrganization.md b/pipelines/README-AthenaOrganization.md new file mode 100644 index 0000000..d899ce4 --- /dev/null +++ b/pipelines/README-AthenaOrganization.md @@ -0,0 +1,25 @@ +# Riverscapes-Athena AWS Bucket Organization + +> **Note** (2026-01-15): This file lives in the `data-exchange-scripts` repository at `pipelines/README-AthenaOrganization.md`. + +Pipeline output for Athena should go in the `riverscapes-athena` bucket under the `data_exchange/` prefix. 
+ +Suggested naming convention, partitions & organization: + +* one folder per project type (machine name), assuming all data in it is of the same type + +## Contents 2026-01-15 + +It's kind of a mess: + +* `2025conus-projects` - a "materialized view", so to speak, generated daily by a Glue script, of projects derived from other data in the `data_exchange` folder +* project_types - a single handmade file used for tracking the waterfall of models in CONUS2025 runs +* projects - populated by dynamodb sync, I think +* riverscape_metrics - Oct 12 2025 scrape using `rme_to_athena/rme_to_athena_parquet.py`, supplemented with more recent incremental updates +* rs_metric_engine2 - Jan 15 enhancement of the riverscape_metrics parquet files with simplified geometry using `add_simplified_geom_pq.py` +* rs-context +* rsdynamics-metrics/ +* rsdynamics/ - maybe we should have grouped them +* table_column_defs -- old version, deprecated & to be removed soon +* table_column_defs_v2 -- ditto +* test-double-geom -- testing whether one parquet file can hold 2 geometries (A: yes). 
to be removed diff --git a/pipelines/rme_to_athena/CHANGELOG.md b/pipelines/rme_to_athena/CHANGELOG.md new file mode 100644 index 0000000..39e29d3 --- /dev/null +++ b/pipelines/rme_to_athena/CHANGELOG.md @@ -0,0 +1,11 @@ +# RME to Athena Pipeline Changelog + +## 1.1 + +* Added: New Parquet files generated from the GeoPackage now include a `geometry_simplified` column +* Changed: Metadata layer_definitions.json updated to 0.8 schema + +## 1.0 + +* Used for CONUS run scrape +* The Parquet results were later augmented with a simplified geometry column using `add_simplified_geom_pq.py` (without going back to the source data in the Data Exchange) diff --git a/pipelines/rme_to_athena/__version__.py b/pipelines/rme_to_athena/__version__.py index 4802e90..f901408 100644 --- a/pipelines/rme_to_athena/__version__.py +++ b/pipelines/rme_to_athena/__version__.py @@ -1 +1 @@ -__version__ = "1.0" +__version__ = "1.1" diff --git a/pipelines/rme_to_athena/layer_definitions.json b/pipelines/rme_to_athena/layer_definitions.json index fada6be..61c4ddc 100644 --- a/pipelines/rme_to_athena/layer_definitions.json +++ b/pipelines/rme_to_athena/layer_definitions.json @@ -1,7 +1,6 @@ { "$schema": "https://xml.riverscapes.net/riverscapes_metadata/schema/layer_definitions.schema.json", - "authority_name": "rme_to_athena", - "tool_schema_version": "1.0.2", + "tool_schema_version": "1.0.3", "layers": [ { "layer_id": "raw_rme", @@ -11,7 +10,7 @@ "columns": [ { "name": "brat_capacity", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Existing Beaver Dam Capacity", "data_unit": "km^-1", "description": "The existing number of beaver dams per unit of riverscape length that the riverscape can support", @@ -23,7 +22,7 @@ }, { "name": "brat_hist_capacity", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historical Beaver Dam Capacity", "data_unit": "km^-1", "description": "The historic number of beaver dams per unit of riverscapes length that the riverscape could support based on modeled historic 
vegetation", @@ -35,7 +34,7 @@ }, { "name": "brat_risk", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Risks to Beaver Dam Restoration", "data_unit": "NA", "description": "Categorical values of risk that beaver dam building activity could pose to human infrastructure", @@ -47,7 +46,7 @@ }, { "name": "brat_opportunity", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Opportunities for Beaver Dam Restoration", "data_unit": "NA", "description": "Categorical values of beaver-related restoration and conservation opportunities", @@ -59,7 +58,7 @@ }, { "name": "brat_limitation", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Limiting Factors to Bever Dam Restoration", "data_unit": "NA", "description": "Categorical natural and anthropogenic limitations on beaver dam building", @@ -71,7 +70,7 @@ }, { "name": "brat_complex_size", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Existing Beaver Dam Complex Size", "data_unit": "count", "description": "The number of dams the section of riverscape could support", @@ -83,7 +82,7 @@ }, { "name": "brat_hist_complex_size", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historical Beaver Dam Complex Size", "data_unit": "count", "description": "The number of dams the section of riverscapes could historically support based on modeled historic vegetation", @@ -95,7 +94,7 @@ }, { "name": "dam_setting", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Dam Setting", "data_unit": "NA", "description": "Beaver dam setting (classic, steep, floodplain)", @@ -107,7 +106,7 @@ }, { "name": "ownership", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Ownership Code", "data_unit": "NA", "description": "Dominant ownership within the DGO", @@ -119,7 +118,7 @@ }, { "name": "state", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "US State", "data_unit": "NA", "description": "The dominant US state within the DGO", @@ -131,7 +130,7 @@ }, { "name": "county", - "dtype": "TEXT", 
+ "dtype": "STRING", "friendly_name": "US County", "data_unit": "NA", "description": "The dominant county within the DGO", @@ -143,7 +142,7 @@ }, { "name": "drainage_area", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Drainage Area", "data_unit": "km^2", "description": "The drainage area of the primary channel in the DGO", @@ -155,7 +154,7 @@ }, { "name": "watershed_id", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Watershed ID", "data_unit": "NA", "description": "The 10-digit USGS watershed Hydrologic Unit Code", @@ -167,7 +166,7 @@ }, { "name": "stream_name", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Stream Name", "data_unit": "NA", "description": "The name of the primary stream in the DGO", @@ -203,7 +202,7 @@ }, { "name": "stream_length", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Stream Length", "data_unit": "m", "description": "Sum of stream segment lengths (NHD flow lines) within the DGO", @@ -215,7 +214,7 @@ }, { "name": "waterbody_type", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Waterbody Type", "data_unit": "NA", "description": "The NHD FCode of waterbody within the DGO", @@ -227,7 +226,7 @@ }, { "name": "waterbody_extent", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Waterbody Extent", "data_unit": "m^2", "description": "The area of waterbody within the DGO", @@ -239,7 +238,7 @@ }, { "name": "ecoregion3", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Ecoregion 3", "data_unit": "NA", "description": "EPA Ecoregion Level 3", @@ -251,7 +250,7 @@ }, { "name": "ecoregion4", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Ecoregion 4", "data_unit": "NA", "description": "EPA Ecoregion Level 4", @@ -263,7 +262,7 @@ }, { "name": "elevation", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Elevation", "data_unit": "m", "description": "The minimum elevation in the DGO", @@ -275,7 +274,7 @@ }, { "name": "geology", - "dtype": "TEXT", + "dtype": "STRING", 
"friendly_name": "Geology", "data_unit": "NA", "description": "The dominant geology type within the DGO", @@ -287,7 +286,7 @@ }, { "name": "huc12", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "HUC12", "data_unit": "NA", "description": "The 12-digit USGS watershed Hydrologic Unit Code", @@ -299,7 +298,7 @@ }, { "name": "prim_channel_gradient", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Primary Channel Gradient", "data_unit": "%", "description": "The gradient of the primary channel in the DGO", @@ -311,7 +310,7 @@ }, { "name": "valleybottom_gradient", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Valley Bottom Gradient", "data_unit": "%", "description": "The gradient of the riverscape centerline", @@ -323,7 +322,7 @@ }, { "name": "rel_flow_length", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Relative Flow Length", "data_unit": "m/m", "description": "Sum of stream lengths in the DGO divided by length of valley centerline", @@ -371,7 +370,7 @@ }, { "name": "tribs_per_km", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Tributaries per Kilometer", "data_unit": "km^-1", "description": "The number of tributaries per unit of riverscape length.", @@ -383,7 +382,7 @@ }, { "name": "planform_sinuosity", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Planform Sinuosity", "data_unit": "m/m", "description": "Sum of river length divided by the valley bottom length within moving window.", @@ -395,7 +394,7 @@ }, { "name": "lowlying_area", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Low-lying Area", "data_unit": "m^2", "description": "The area of valley bottom classified as 'low_lying' within the DGO", @@ -407,7 +406,7 @@ }, { "name": "elevated_area", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Elevated Area", "data_unit": "m^2", "description": "The area of valley bottom classified as 'elevated' within the DGO", @@ -419,7 +418,7 @@ }, { "name": "channel_area", - "dtype": "REAL", + "dtype": 
"FLOAT", "friendly_name": "Channel Area", "data_unit": "m^2", "description": "The area of active channel within the DGO.", @@ -431,7 +430,7 @@ }, { "name": "floodplain_area", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Floodplain Area", "data_unit": "m^2", "description": "The area of active floodplain within the DGO.", @@ -443,7 +442,7 @@ }, { "name": "integrated_width", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Integrated Width", "data_unit": "m", "description": "The width of the riverscape", @@ -455,7 +454,7 @@ }, { "name": "active_channel_ratio", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Active Channel Ratio", "data_unit": "m^2/m^2", "description": "The proportion of the riverscape that is active channel", @@ -467,7 +466,7 @@ }, { "name": "low_lying_ratio", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Low Lying Ratio", "data_unit": "m^2/m^2", "description": "The proportion of the riversape that is relatively low-lying surface", @@ -479,7 +478,7 @@ }, { "name": "elevated_ratio", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Elevated Ratio", "data_unit": "m^2/m^2", "description": "The proportion of the riverscape that is relatively elevated surface", @@ -491,7 +490,7 @@ }, { "name": "floodplain_ratio", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Floodplain Ratio", "data_unit": "m^2/m^2", "description": "proportion of the riverscape that is floodplain (low-lying + elevated; not channel)", @@ -503,7 +502,7 @@ }, { "name": "acres_vb_per_mile", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Acres Valley Bottom Per Mile", "data_unit": "acre/mile", "description": "Area of valley bottom per riverscape length", @@ -515,7 +514,7 @@ }, { "name": "hect_vb_per_km", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Hectares Valley Bottom Per Kilometer", "data_unit": "ha/km", "description": "Area of valley bottom per riverscape length", @@ -527,7 +526,7 @@ }, { "name": 
"channel_width", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Channel Width", "data_unit": "m", "description": "Width of the primary channel in the DGO", @@ -539,7 +538,7 @@ }, { "name": "confinement_ratio", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Confinement Ratio", "data_unit": "m/m", "description": "The ratio of channel length that abuts a confining margin to the total channel length", @@ -551,7 +550,7 @@ }, { "name": "constriction_ratio", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Constriction Ratio", "data_unit": "m/m", "description": "The ratio of channel length constricted by confining margins on both sides to total channel length", @@ -563,7 +562,7 @@ }, { "name": "confining_margins", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Confining Margins", "data_unit": "m", "description": "The length of confining margins within the DGO", @@ -575,7 +574,7 @@ }, { "name": "constricting_margins", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Constricting Margins", "data_unit": "m", "description": "The length of constricting margins (confining on both sides of the channel simultaneously) within the DGO", @@ -587,7 +586,7 @@ }, { "name": "qlow", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Low Flow", "data_unit": "ft^3/s", "description": "Low flow value of the primary channel in the DGO", @@ -599,7 +598,7 @@ }, { "name": "q2", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "High Flow", "data_unit": "ft^3/s", "description": "Typical flood (Q2) flow value of the primary channel in the DGO", @@ -611,7 +610,7 @@ }, { "name": "splow", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Low Stream Power", "data_unit": "W", "description": "Low flow stream power", @@ -623,7 +622,7 @@ }, { "name": "sphigh", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "High Stream Power", "data_unit": "W", "description": "Typical flood (Q2) stream power", @@ -635,7 +634,7 @@ }, { "name": 
"road_len", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Road Length", "data_unit": "m", "description": "Length of roads within the DGO", @@ -647,7 +646,7 @@ }, { "name": "road_dens", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Road Density", "data_unit": "m/m", "description": "Length of roads per length of riverscape centerline", @@ -659,7 +658,7 @@ }, { "name": "rail_len", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Rail Length", "data_unit": "m", "description": "Length of railroads within the DGO", @@ -671,7 +670,7 @@ }, { "name": "rail_dens", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Rail Density (m/m)", "data_unit": "m/m", "description": "Length of railroads per length of riverscape centerline", @@ -683,7 +682,7 @@ }, { "name": "land_use_intens", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Land Use Intensity", "data_unit": "NA", "description": "Land use intensity from 0 (None) to 100 (High) within the DGO", @@ -695,7 +694,7 @@ }, { "name": "road_dist", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Minimum Distance Between Channel and Roads", "data_unit": "m", "description": "The minimum distance between roads and river channels", @@ -707,7 +706,7 @@ }, { "name": "rail_dist", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Minimum Distance Between Channel and Railroads", "data_unit": "m", "description": "The minimum distance between railroads and river channels", @@ -719,7 +718,7 @@ }, { "name": "div_dist", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Minimum Distance Between Channel and Diversions", "data_unit": "m", "description": "The minimum distance between diversion points and river channels", @@ -731,7 +730,7 @@ }, { "name": "canal_dist", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Minimum Distance Between Channel and Canals", "data_unit": "m", "description": "The minimum distance between canals and river channels", @@ -743,7 +742,7 @@ }, { 
"name": "infra_dist", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Minimum Distance Between Channel and Infrastructure", "data_unit": "m", "description": "The minimum distance between any infrastructure and river channels", @@ -755,7 +754,7 @@ }, { "name": "fldpln_access", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Floodplain Accessibility", "data_unit": "m^2/m^2", "description": "The proportion of flooplain accessible to the channel (e.g., via lateral migration)", @@ -767,7 +766,7 @@ }, { "name": "access_fldpln_extent", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Floodplain Acessibility Extent", "data_unit": "m^2", "description": "The area of the accessible floodplain", @@ -779,7 +778,7 @@ }, { "name": "lf_evt", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Dominant Existing Vegetation Classes", "data_unit": "NA", "description": "The dominant Landfire EVT classes", @@ -791,7 +790,7 @@ }, { "name": "lf_bps", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Dominstant Historical Vegetation Classes", "data_unit": "NA", "description": "The dominant Landfire BpS classes", @@ -803,7 +802,7 @@ }, { "name": "lf_agriculture_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Agriculture", "data_unit": "m^2/m^2", "description": "The proportion of agricultural cover in the riverscape", @@ -815,7 +814,7 @@ }, { "name": "lf_agriculture", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Agriculture", "data_unit": "m^2", "description": "The area of agricultural cover in the riverscape", @@ -827,7 +826,7 @@ }, { "name": "lf_conifer_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proprotion Coniferous", "data_unit": "m^2/m^2", "description": "The proportion of conifer cover in the riverscape", @@ -839,7 +838,7 @@ }, { "name": "lf_conifer", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Coniferous", "data_unit": "m^2", "description": "The area of conifer 
cover in the riverscape", @@ -851,7 +850,7 @@ }, { "name": "lf_conifer_hardwood_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Mixed Hardwood/Coniferous", "data_unit": "m^2/m^2", "description": "The proportion of mixed conifer-hardwood cover in the riverscape", @@ -863,7 +862,7 @@ }, { "name": "lf_conifer_hardwood", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Mixed Hardwood/Coniferous", "data_unit": "m^2", "description": "The area of mixed conifer-hardwood cover in the riverscape", @@ -875,7 +874,7 @@ }, { "name": "lf_developed_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Developed", "data_unit": "m^2/m^2", "description": "The proportion of the riverscape that is developed", @@ -887,7 +886,7 @@ }, { "name": "lf_developed", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Developed", "data_unit": "m^2", "description": "The area of the riverscape that is developed", @@ -899,7 +898,7 @@ }, { "name": "lf_exotic_herbaceous_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Exotic Herbacious", "data_unit": "m^2/m^2", "description": "The proportion exotic herbaceous cover in the riverscape", @@ -911,7 +910,7 @@ }, { "name": "lf_exotic_herbaceous", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Exotic Herbacious", "data_unit": "m^2", "description": "The area of exotic herbaceous cover in the riverscape", @@ -923,7 +922,7 @@ }, { "name": "lf_exotic_tree_shrub_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Exotic Trees or Shrubs", "data_unit": "m^2/m^2", "description": "The proportion of exotic tree and shrub cover in the riverscape", @@ -935,7 +934,7 @@ }, { "name": "lf_exotic_tree_shrub", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Exotic Tree or Shrub", "data_unit": "m^2", "description": "The area of exotic tree and shrub cover in the riverscape", @@ -947,7 +946,7 @@ }, { "name": "lf_grassland_prop", 
- "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Grassland", "data_unit": "m^2/m^2", "description": "The proportion of grassland cover in the riverscape", @@ -959,7 +958,7 @@ }, { "name": "lf_grassland", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Grassland", "data_unit": "m^2", "description": "The area of grassland cover in the riverscape", @@ -971,7 +970,7 @@ }, { "name": "lf_hardwood_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Hardwood", "data_unit": "m^2/m^2", "description": "The proportion of hardwood forest cover in the riverscape", @@ -983,7 +982,7 @@ }, { "name": "lf_hardwood", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Hardwood", "data_unit": "m^2", "description": "The area of hardwood forest cover in the riverscape", @@ -995,7 +994,7 @@ }, { "name": "lf_riparian_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Riparian Vegetation", "data_unit": "m^2/m^2", "description": "The proportion of riparian vegetation cover in the riverscape", @@ -1007,7 +1006,7 @@ }, { "name": "lf_riparian", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Riparian Vegetation", "data_unit": "m^2", "description": "The area of riparian vegetation cover in the riverscape (from LandFire)", @@ -1019,7 +1018,7 @@ }, { "name": "lf_shrubland_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Shrubland", "data_unit": "m^2/m^2", "description": "The proportion of shrubland cover in the riverscape", @@ -1031,7 +1030,7 @@ }, { "name": "lf_shrubland", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Shrubland", "data_unit": "m^2", "description": "The area of shrubland cover in the riverscape", @@ -1043,7 +1042,7 @@ }, { "name": "lf_sparsely_vegetated_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion Sparsely Vegetated", "data_unit": "m^2/m^2", "description": "The proportion of the riverscape that is sparsely 
vegetated", @@ -1055,7 +1054,7 @@ }, { "name": "lf_sparsely_vegetated", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area Sparsely Vegetated", "data_unit": "m^2", "description": "The area of the riverscape that is sparsely vegetated", @@ -1067,7 +1066,7 @@ }, { "name": "lf_hist_conifer_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion Coniferous", "data_unit": "m^2/m^2", "description": "The historic proportion of conifer cover in the riverscape", @@ -1079,7 +1078,7 @@ }, { "name": "lf_hist_conifer", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Coniferous", "data_unit": "m^2", "description": "The historic area of conifer cover in the riverscape", @@ -1091,7 +1090,7 @@ }, { "name": "lf_hist_conifer_hardwood_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Hisotric Proportion Mixed Coniferous/Hardwood", "data_unit": "m^2/m^2", "description": "The historic proportion of mixed conifer-hardwood forest in the riverscape", @@ -1103,7 +1102,7 @@ }, { "name": "lf_hist_conifer_hardwood", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Mixed Coniferous/Hardwood", "data_unit": "m^2", "description": "The historic area of mixed conifer-hardwood forest in the riverscape", @@ -1115,7 +1114,7 @@ }, { "name": "lf_hist_grassland_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion Grassland", "data_unit": "m^2/m^2", "description": "The historic proportion of grassland in the riverscape", @@ -1127,7 +1126,7 @@ }, { "name": "lf_hist_grassland", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Grassland", "data_unit": "m^2", "description": "The historic area of grassland in the riverscape", @@ -1139,7 +1138,7 @@ }, { "name": "lf_hist_hardwood_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion Hardwood", "data_unit": "m^2/m^2", "description": "The historic proportion of hardwood forest in the 
riverscape", @@ -1151,7 +1150,7 @@ }, { "name": "lf_hist_hardwood", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Hardwood", "data_unit": "m^2", "description": "The historic area of hardwood forest in the riverscape", @@ -1163,7 +1162,7 @@ }, { "name": "lf_hist_hardwood_conifer_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion Mixed Hardwood/Conifer", "data_unit": "m^2/m^2", "description": "The historic proportion of mixed hardwood-conifer forest in the riverscape", @@ -1175,7 +1174,7 @@ }, { "name": "lf_hist_hardwood_conifer", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Mixed Hardwood/Conifer", "data_unit": "m^2", "description": "The historic area of mixed hardwood-conifer forest in the riverscape", @@ -1187,7 +1186,7 @@ }, { "name": "lf_hist_peatland_forest_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion Peatland", "data_unit": "m^2/m^2", "description": "The historic proportion of peatland forest in the riverscape", @@ -1199,7 +1198,7 @@ }, { "name": "lf_hist_peatland_forest", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Peatland", "data_unit": "m^2", "description": "The historic area of peatland forest in the riverscape", @@ -1211,7 +1210,7 @@ }, { "name": "lf_hist_peatland_nonforest_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion Peatland or Non-Forest", "data_unit": "m^2/m^2", "description": "The historic proportion of non-forested peatland in the riverscape", @@ -1223,7 +1222,7 @@ }, { "name": "lf_hist_peatland_nonforest", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Peatland or Non-Forest", "data_unit": "m^2", "description": "The historic area of non-forested peatland in the riverscape", @@ -1235,7 +1234,7 @@ }, { "name": "lf_hist_riparian_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion Riparian Vegetation", 
"data_unit": "m^2/m^2", "description": "The historic proportion of riparian vegetation in the riverscape", @@ -1247,7 +1246,7 @@ }, { "name": "lf_hist_riparian", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Riparian Vegetation", "data_unit": "m^2", "description": "The historic area of riparian vegetation in the riverscape", @@ -1259,7 +1258,7 @@ }, { "name": "lf_hist_savanna_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion Savanna", "data_unit": "m^2/m^2", "description": "The historic proportion of savanna in the riverscape", @@ -1271,7 +1270,7 @@ }, { "name": "lf_hist_savanna", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Savanna", "data_unit": "m^2", "description": "The historic area of savanna in the riverscape", @@ -1283,7 +1282,7 @@ }, { "name": "lf_hist_shrubland_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion Shrubland", "data_unit": "m^2/m^2", "description": "The historic proportion of shrubland in the riverscape", @@ -1295,7 +1294,7 @@ }, { "name": "lf_hist_shrubland", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area Shrubland", "data_unit": "m^2", "description": "The historic area of shrubland in the riverscape", @@ -1307,7 +1306,7 @@ }, { "name": "lf_hist_sparsely_vegetated_prop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion of Sparse Vegetation", "data_unit": "m^2/m^2", "description": "The historic proportion of the riverscape that was sparsely vegetated", @@ -1319,7 +1318,7 @@ }, { "name": "lf_hist_sparsely_vegetated", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area of Sparse Vegetation", "data_unit": "m^2", "description": "The historic area of the riverscape that was sparsely vegetated", @@ -1331,7 +1330,7 @@ }, { "name": "ex_riparian", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Area of Riparian Vegetation", "data_unit": "m^2", 
"description": "Area of existing riparian vegetation (as calculated by RCAT, which includes additonal landfire categories)", @@ -1343,7 +1342,7 @@ }, { "name": "hist_riparian", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Area of Riparian Vegetation", "data_unit": "m^2", "description": "Area of historic riparian vegetation (as calculated by RCAT, which includes additonal landfire categories)", @@ -1355,7 +1354,7 @@ }, { "name": "prop_riparian", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Proportion of Riparian Vegetation", "data_unit": "m^2/m^2", "description": "The area of existing riparian cover divided by the area of the riverscape", @@ -1367,7 +1366,7 @@ }, { "name": "hist_prop_riparian", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Historic Proportion of Riparian Vegetation", "data_unit": "m^2/m^2", "description": "The area of historic riparian cover divided by the area of the riverscape", @@ -1379,7 +1378,7 @@ }, { "name": "riparian_veg_departure", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Riparian Proportion Current over Historic", "data_unit": "m^2/m^2", "description": "Ratio of existing riparian vegetation cover over historic riparian vegetation cover. This is often expressed or symbolized as departure from historic which is 1 minus this value. 
Value of -9999 means valley too narrow to sample.", @@ -1391,7 +1390,7 @@ }, { "name": "ag_conversion", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Conversion from Riparian to Agriculture", "data_unit": "m^2/m^2", "description": "The proportion of historic riparian vegetation converted to agriculture", @@ -1403,7 +1402,7 @@ }, { "name": "develop", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Conversion from Riparian to Developed", "data_unit": "m^2/m^2", "description": "The proportion of historic riparian vegetation converted to developed", @@ -1415,7 +1414,7 @@ }, { "name": "grass_shrub_conversion", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Conversion from Riparian to Grass/Shrubland", "data_unit": "m^2/m^2", "description": "The proportion of historic riparian vegetation converted to grass-shrubland", @@ -1427,7 +1426,7 @@ }, { "name": "conifer_encroachment", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Conversion from Riparian to Conifer", "data_unit": "m^2/m^2", "description": "The proportion of historic riparian vegetation converted to conifer", @@ -1439,7 +1438,7 @@ }, { "name": "invasive_conversion", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Conversion from Native Riparian to Invasive", "data_unit": "m^2/m^2", "description": "The proportion of historic riparian vegetation converted to invasive vegetation", @@ -1451,7 +1450,7 @@ }, { "name": "riparian_condition", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Riparian Condition Index", "data_unit": "NA", "description": "A riparian condition index from (0) poor to (1) intact based on riparian vegetation departure, land use intensity, and floodplain accessibility", @@ -1462,20 +1461,29 @@ "default_value": null }, { - "name": "geom", - "dtype": "POINT", + "name": "dgo_geom", + "dtype": "GEOMETRY", "friendly_name": "DGO Geometry", "data_unit": "", "description": "Well-known-binary (WKB) representation of the DGO Polygon", "is_key": false, 
"is_required": false, - "theme": "DGOS", - "preferred_bin_definition": "", - "default_value": null + "theme": "DGOS" + }, + { + "name": "geometry_simplified", + "dtype": "GEOMETRY", + "friendly_name": "DGO Geometry (Simplified)", + "data_unit": "", + "description": "Well-known-binary (WKB) representation of the DGO Polygon simplified with a tolerance of 11 metres", + "is_key": false, + "is_required": false, + "theme": "DGOS" }, + { "name": "level_path", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "Level Path", "data_unit": "", "description": "the NHD level path, which is a unique identifier for each stream from its origin to its end in the NHDPlusHR dataset", @@ -1487,7 +1495,7 @@ }, { "name": "seg_distance", - "dtype": "MEDIUMINT", + "dtype": "INTEGER", "friendly_name": "Segment Distance", "data_unit": "", "description": "(From VBET) the distance along a given level path that the IGO is located (used for toplogical purposes)", @@ -1499,7 +1507,7 @@ }, { "name": "centerline_length", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Riverscape Length", "data_unit": "m", "description": "The length of valley centerline", @@ -1511,7 +1519,7 @@ }, { "name": "segment_area", - "dtype": "REAL", + "dtype": "FLOAT", "friendly_name": "Riverscape Area", "data_unit": "m^2", "description": "the area of the DGO polygon", @@ -1523,7 +1531,7 @@ }, { "name": "fcode", - "dtype": "MEDIUMINT", + "dtype": "INTEGER", "friendly_name": "Feature Code", "data_unit": "", "description": "The NHD Feature Code representing flow type of the primary channel", @@ -1535,7 +1543,7 @@ }, { "name": "rme_project_id", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "RME Project ID", "data_unit": "", "description": "Project ID for the source metric engine project on Riverscapes Data Exchange, data.riverscapes.net", @@ -1559,7 +1567,7 @@ }, { "name": "rme_version", - "dtype": "TEXT", + "dtype": "STRING", "friendly_name": "RME Project Version", "data_unit": "", "description": "RME Version as 
string, e.g. 1.3.4", @@ -1583,5 +1591,6 @@ } ] } - ] + ], + "tool_schema_name": "rme_to_athena" } \ No newline at end of file diff --git a/pipelines/rme_to_athena/rme_to_athena_parquet.py b/pipelines/rme_to_athena/rme_to_athena_parquet.py index c15970f..dc7b779 100644 --- a/pipelines/rme_to_athena/rme_to_athena_parquet.py +++ b/pipelines/rme_to_athena/rme_to_athena_parquet.py @@ -8,6 +8,7 @@ Enhances Philip's June 2025 rme_to_athena.py """ import argparse +import json import logging import os from pathlib import Path @@ -120,7 +121,7 @@ def get_matching_file(parent_dir: str, regex_str: str) -> str | None: def download_rme_geopackage( rs_api: RiverscapesAPI, project: RiverscapesProject, - huc_dir: str + huc_dir: str | Path ) -> str: """ Download the RME GeoPackage for a project and return its file path. @@ -132,6 +133,23 @@ def download_rme_geopackage( return rme_gpkg +def get_layer_columns_dict(layer_definitions_path: Path, layer_id: str) -> dict[str, dict]: + """ + Load the layer_definitions.json and return a dictionary of columns and their properties for the given layer_id. + Args: + layer_definitions_path (Path): the Path to the json file to use + layer_id (str): The layer_id to look up. + Returns: + dict[str, dict]: Dictionary mapping column names to their property dicts. + """ + with layer_definitions_path.open('r', encoding='utf-8') as f: + data = json.load(f) + for layer in data.get('layers', []): + if layer.get('layer_id') == layer_id: + return {col['name']: col for col in layer.get('columns', [])} + raise ValueError(f"Layer ID '{layer_id}' not found in {layer_definitions_path}") + + def extract_metrics_to_geodataframe(gpkg_path: str, spatialite_path: str) -> gpd.GeoDataFrame: """ Connect to the GeoPackage, run the SQL, and return a GeoDataFrame. 
@@ -169,11 +187,19 @@ def extract_metrics_to_geodataframe(gpkg_path: str, spatialite_path: str) -> gpd warnings.filterwarnings("ignore", message="pandas only supports SQLAlchemy connectable") df = pd.read_sql_query(sql, conn) + # Use the data dictionary to set column types. # because there are nulls, the combination of sqlites dynamic typing and pandas' type inference mis-assigns data types # actually the problem is that it is sometimes a double, sometimes INT64. Needs to be consistent - # TODO: this should look up the types from our data dictionary but i am hardcoding for now - for field in ['fcode', 'seg_distance', 'stream_order', 'headwater', 'confluences', 'diffluences', 'tributaries']: - df[field] = df[field].astype('Int64') # Note the capital 'I' for pandas nullable integer + try: + # NOTE - using this one definitions file to describe both INPUT AND OUTPUT structure + # ideally we'd take the data types from the RME data dictionary and write them (along with any changes we're making) to the raw_rme version. 
But that doesn't exist yet + # Possible enhancement: check the is_required property and if TRUE then we could use a plain (non-nullable) integer + columns_dict = get_layer_columns_dict(Path(__file__).parent / 'layer_definitions.json', 'raw_rme') + for field, props in columns_dict.items(): + if props.get('dtype') == 'INTEGER' and field in df.columns: + df[field] = df[field].astype('Int64') # pandas nullable integer + except Exception as e: + raise Exception(f"Could not apply data dictionary types: {e}") from e # Remove all columns named 'dgoid' (case-insensitive, even if duplicated) df = df.loc[:, [col for col in df.columns if col.lower() != 'dgoid']] @@ -181,6 +207,16 @@ def extract_metrics_to_geodataframe(gpkg_path: str, spatialite_path: str) -> gpd df['dgo_geom'] = df['dgo_geom'].apply(wkb.loads) # pyright: ignore[reportCallIssue, reportArgumentType] gdf = gpd.GeoDataFrame(df, geometry='dgo_geom', crs='EPSG:4326') + # Reproject to EPSG:5070 for simplification + gdf_proj = gdf.to_crs(epsg=5070) + + # Use simplify_coverage for topology-preserving simplification + gdf["geometry_simplified"] = gdf_proj.geometry.simplify_coverage(tolerance=11) # 11 m seems to have worked well + # Reproject simplified geometry back to EPSG:4326 + gdf["geometry_simplified"] = gpd.GeoSeries(gdf["geometry_simplified"], crs=5070).to_crs(epsg=4326) + gdf = gdf.set_crs(epsg=4326) + gdf = gdf.reset_index(drop=True) + bbox_df = gdf.geometry.bounds.rename(columns={'minx': 'xmin', 'miny': 'ymin', 'maxx': 'xmax', 'maxy': 'ymax'}) # Combine into a struct-like dict for each row gdf['dgo_geom_bbox'] = bbox_df.apply( @@ -191,10 +227,10 @@ def extract_metrics_to_geodataframe(gpkg_path: str, spatialite_path: str) -> gpd return gdf -def delete_folder(dirpath: str) -> None: +def delete_folder(dirpath: Path) -> None: """delete a local folder and its contents""" log = Logger('delete downloads') - if os.path.isdir(dirpath): + if dirpath.is_dir(): try: log.info(f'Deleting directory {dirpath}') shutil.rmtree(dirpath) @@
-204,7 +240,7 @@ def delete_folder(dirpath: str) -> None: def upload_to_s3( - file_path: str, + file_path: str | Path, s3_bucket: str, s3_key: str ) -> None: @@ -235,20 +271,21 @@ def scrape_rme( # 2. For each project: # - create a folder # - Download and validate - # - Extract metrics as GeoDataFrame + # - Extract metrics, geometries as GeoDataFrame # - Write GeoParquet # - Upload to S3 # - Optionally clean up log = Logger('Scrape RME') - + download_dir = Path(download_dir) # NEW WAY # run Athena query to find all eligible projects that are newer than what is already scraped projects_to_add_df = query_to_dataframe(missing_projects_query, 'identify new projects') if projects_to_add_df.empty: log.info("Query to identify projects to scrape returned no results.") return - + # test a single project + # projects_to_add_df = pd.DataFrame({'project_id': ['5aeff0f8-5a8e-4db8-8e6c-9e507b20eca0']}) count = 0 prg = ProgressBar(projects_to_add_df.shape[0], text="Scrape Progress") for project_id in projects_to_add_df['project_id']: @@ -267,8 +304,8 @@ def scrape_rme( model_version_int = semver_to_int(project.model_version) try: - huc_dir = os.path.join(download_dir, project.huc) - safe_makedirs(huc_dir) + huc_dir = download_dir / project.huc + safe_makedirs(str(huc_dir)) gpkg_path = download_rme_geopackage(rs_api, project, huc_dir) data_gdf = extract_metrics_to_geodataframe(gpkg_path, spatialite_path) # add common project-level columns @@ -279,12 +316,12 @@ def scrape_rme( log.debug(f"Dataframe prepared with shape {data_gdf.shape}") # until we have a more robust schema check this is something - if len(data_gdf.columns) != 134: - log.warning(f"Expected 134 columns, got {len(data_gdf.columns)}") - rme_pq_filepath = os.path.join(huc_dir, f'rme_{project.huc}.parquet') + if len(data_gdf.columns) != 135: + log.warning(f"Expected 135 columns, got {len(data_gdf.columns)}") + rme_pq_filepath = huc_dir / f'rme_{project.huc}.parquet' data_gdf.to_parquet(rme_pq_filepath) - # don't use 
os.path.join because this is aws os, not system os - s3_key = f'data_exchange/riverscape_metrics/{os.path.basename(rme_pq_filepath)}' + # do not use os.path.join because this is aws os, not system os + s3_key = f'data_exchange/riverscape_metrics/{rme_pq_filepath.name}' upload_to_s3(rme_pq_filepath, data_bucket, s3_key) if delete_downloads_when_done: @@ -307,10 +344,11 @@ def main(): args = dotenv.parse_args_env(parser) # Set up some reasonable folders to store things - working_folder = args.working_folder - download_folder = os.path.join(working_folder, 'downloads') + working_folder = Path(args.working_folder) + download_folder = working_folder / 'downloads' + + safe_makedirs(str(working_folder)) - safe_makedirs(working_folder) log = Logger('Setup') log.setup(log_path=os.path.join(working_folder, 'rme-athena.log'), log_level=logging.DEBUG) diff --git a/pipelines/rscontext_to_athena/layer_definitions.json b/pipelines/rscontext_to_athena/layer_definitions.json index 3791aea..d7e0bbc 100644 --- a/pipelines/rscontext_to_athena/layer_definitions.json +++ b/pipelines/rscontext_to_athena/layer_definitions.json @@ -1,479 +1,498 @@ { - "$schema": "https://xml.riverscapes.net/riverscapes_metadata/schema/layer_definitions.schema.json", - "authority_name": "rscontext_to_athena", - "tool_schema_version": "1.0.2", - "layers": [ - { - "layer_id": "rs_context_huc10", - "layer_name": "rs_context_huc10", - "description": "", - "layer_type": "DataTable", - "columns": [ - { - "name": "huc", - "dtype": "TEXT", - "friendly_name": "HUC10", - "data_unit": "NA", - "description": "HUC10 Watershed ID", - "is_key": true, - "is_required": true, - "theme": "", - "preferred_bin_definition": "", - "default_value": null - }, - { - "name": "hucName", - "dtype": "TEXT", - "friendly_name": "Watershed Name", - "data_unit": "NA", - "description": "", - "is_key": false, - "is_required": false, - "theme": "", - "default_value": null - }, - { - "name": "hucStates", - "dtype": "TEXT", - "friendly_name": 
"State(s)", - "data_unit": "", - "description": "", - "theme": "" - }, - { - "name": "hucAreaSqKm", - "dtype": "REAL", - "friendly_name": "Watershed Area", - "data_unit": "km^2", - "description": "", - "is_key": false, - "is_required": false, - "theme": "", - "default_value": null - }, - { - "name": "hucAreaAcres", - "dtype": "REAL", - "friendly_name": "Watershed Area (acres)", - "data_unit": "acre", - "description": "", - "is_key": false, - "is_required": false, - "theme": "", - "default_value": null - }, - { - "name": "flowlineLengthPerennialKm", - "dtype": "REAL", - "friendly_name": "Perrenial Stream Length", - "data_unit": "km" - }, - { - "name": "flowlineLengthIntermittentKm", - "dtype": "REAL", - "friendly_name": "Intermittent Stream Length", - "data_unit": "km" - }, - { - "name": "flowlineLengthEphemeralKm", - "dtype": "REAL", - "friendly_name": "Ephemeral Stream Length", - "data_unit": "km" - }, - { - "name": "flowlineLengthCanalsKm", - "dtype": "REAL", - "friendly_name": "Canal Length", - "data_unit": "km" - }, - { - "name": "flowlineLengthAllKm", - "dtype": "REAL", - "friendly_name": "Total Stream Length", - "data_unit": "km" - }, - { - "name": "flowlineFeatureCount", - "dtype": "INTEGER", - "friendly_name": "Number of Stream Segments", - "data_unit": "count" - }, - { - "name": "waterbodyAreaSqKm", - "dtype": "REAL", - "friendly_name": "Total Waterbody Area", - "data_unit": "km^2" - }, - { - "name": "waterbodyFeatureCount", - "dtype": "INTEGER", - "friendly_name": "Total count of Waterbodies", - "data_unit": "count" - }, - { - "name": "waterbodyLakesPondsAreaSqKm", - "dtype": "REAL", - "friendly_name": "Lake Area", - "data_unit": "km^2" - }, - { - "name": "waterbodyReservoirAreaSqKm", - "dtype": "REAL", - "friendly_name": "Reservoir Area", - "data_unit": "km^2" - }, - { - "name": "waterbodyEstuariesAreaSqKm", - "dtype": "REAL", - "friendly_name": "Estuaries Area", - "data_unit": "km^2" - }, - { - "name": "waterbodyPlayaAreaSqKm", - "dtype": "REAL", - 
"friendly_name": "Playa Area", - "data_unit": "km^2" - }, - { - "name": "waterbodySwampMarshAreaSqKm", - "dtype": "REAL", - "friendly_name": "Swamp/Marsh Area", - "data_unit": "km^2" - }, - { - "name": "waterbodyIceSnowAreaSqKm", - "dtype": "REAL", - "friendly_name": "Ice/Snow Area", - "data_unit": "km^2" - }, - { - "name": "demMaximum", - "dtype": "REAL", - "friendly_name": "DEM Max", - "data_unit": "m", - "description": "Maximum Elevation", - "theme": "Elevation" - }, - { - "name": "demMinimum", - "dtype": "REAL", - "friendly_name": "DEM Min", - "data_unit": "m", - "description": "Minimum Elevation", - "theme": "Elevation" - }, - { - "name": "demSum", - "dtype": "REAL", - "data_unit": "m", - "friendly_name": "DEM Spatial Sum of Grid Values", - "description": "Sum of individual grid cell values for elevation. Meaningless on its own, this is useful for calculating spatial average.", - "theme": "Elevation" - }, - { - "name": "demCount", - "dtype": "INTEGER", - "friendly_name": "Number of DEM Grid Cells", - "data_unit": "count", - "description": "Count of individual grid cells used in aggregations. 
Meaningless on its own, this is useful for calculating spatial average.", - "theme": "Elevation" - }, - { - "name": "demMean", - "dtype": "REAL", - "friendly_name": "DEM Spatial Mean", - "data_unit": "m", - "description": "Mean elevation across the geography.", - "theme": "Elevation" - }, - { - "name": "demRange", - "dtype": "REAL", - "friendly_name": "DEM Spatial Range", - "data_unit": "m", - "description": "Spatial variability, ie difference between Max and Min elevation across subject area.", - "theme": "Elevation" - }, - { - "name": "slopeMaximum", - "dtype": "REAL", - "friendly_name": "Slope Max", - "data_unit": "degree", - "description": "Maximum Slope", - "theme": "Slope" - }, - { - "name": "slopeMinimum", - "dtype": "REAL", - "friendly_name": "Slope Min", - "data_unit": "degree", - "description": "Minimum Slope", - "theme": "Slope" - }, - { - "name": "slopeSum", - "dtype": "REAL", - "data_unit": "degree", - "friendly_name": "Slope Spatial Sum of Grid Values", - "description": "Sum of individual grid cell values for slope. Meaningless on its own, this is useful for calculating spatial average.", - "theme": "Slope" - }, - { - "name": "slopeCount", - "dtype": "INTEGER", - "friendly_name": "Number of Slope Grid Cells", - "data_unit": "count", - "description": "Count of individual grid cells used in aggregations. 
Meaningless on its own, this is useful for calculating spatial average.", - "theme": "Slope" - }, - { - "name": "slopeMean", - "dtype": "REAL", - "friendly_name": "Slope Spatial Mean", - "data_unit": "degree", - "description": "Mean slope across the geography.", - "theme": "Slope" - }, - { - "name": "slopeRange", - "dtype": "REAL", - "friendly_name": "Slope Spatial Range", - "data_unit": "degree", - "description": "Spatial variability, ie difference between Max and Min slope across subject area.", - "theme": "Slope" - }, - { - "name": "precipMaximum", - "dtype": "REAL", - "friendly_name": "Precipitation Spatial Max", - "data_unit": "mm", - "description": "Maximum 30-year-average annual precipitation", - "theme": "PRISM Climate" - }, - { - "name": "precipMinimum", - "dtype": "REAL", - "friendly_name": "Precipitation Spatial Min", - "data_unit": "mm", - "description": "Minimum 30-year-average annual precipation", - "theme": "PRISM Climate" - }, - { - "name": "precipSum", - "dtype": "REAL", - "data_unit": "mm", - "friendly_name": "Precipitation Spatial Sum of Grid Values", - "description": "Sum of individual 800-meter grid cell values for 30-year-average annual precipitation. Meaningless on its own, this is useful for calculating spatial average.", - "theme": "PRISM Climate" - }, - { - "name": "precipCount", - "dtype": "INTEGER", - "friendly_name": "Number of Precipitation Grid Cells", - "data_unit": "count", - "description": "Count of individual 800-meter grid cell values used in aggregations. 
Meaningless on its own, this is useful for calculating spatial average.", - "theme": "PRISM Climate" - }, - { - "name": "precipMean", - "dtype": "REAL", - "friendly_name": "Precipitation Spatial Mean", - "data_unit": "mm", - "description": "Mean across the geography of the 30-year average annual precipitation.", - "theme": "PRISM Climate" - }, - { - "name": "precipRange", - "dtype": "REAL", - "friendly_name": "Precipitation Spatial Range", - "data_unit": "mm", - "description": "Spatial variability (difference between Max and Min precipitation areas) in 30-year average precipitation.", - "theme": "PRISM Climate" - }, - { - "name": "catchmentLength", - "dtype": "REAL", - "friendly_name": "Catchment Length", - "data_unit": "km", - "description": "", - "theme": "Morphometric" - }, - { - "name": "catchmentArea", - "dtype": "REAL", - "friendly_name": "Catchment Area", - "data_unit": "km^2", - "description": "", - "theme": "Morphometric" - }, - { - "name": "catchmentPerimeter", - "dtype": "REAL", - "friendly_name": "Catchment Perimeter", - "data_unit": "km", - "description": "", - "theme": "Morphometric" - }, - { - "name": "circularityRatio", - "dtype": "REAL", - "friendly_name": "Circularity Ratio", - "data_unit": "dimensionless", - "description": "Calculated as `catchment_area / bounding_circle_area` (both in same units)", - "theme": "Morphometric" - }, - { - "name": "elongationRatio", - "dtype": "REAL", - "friendly_name": "Elongation Ratio", - "data_unit": "dimensionless", - "description": "Calculated as `catchment_area_km2**0.5 / catchment_length_km`", - "theme": "Morphometric" - }, - { - "name": "formFactor", - "dtype": "REAL", - "friendly_name": "Form Factor", - "data_unit": "dimensionless", - "description": "Calculated as `catchment_area_km2 / catchment_length_km**2`", - "theme": "Morphometric" - }, - { - "name": "catchmentRelief", - "dtype": "REAL", - "friendly_name": "Catchment Relief", - "data_unit": "m", - "description": "The difference between highest and 
lowest elevation. Equivalent to DEM Range", - "theme": "Morphometric" - }, - { - "name": "reliefRatio", - "dtype": "REAL", - "friendly_name": "Relief Ratio", - "data_unit": "dimensionless", - "description": "Calculated as relief (in km) divided by catchment length (in km).", - "theme": "Morphometric" - }, - { - "name": "dem_bins.min", - "dtype": "REAL", - "friendly_name": "Lowest Elevation", - "data_unit": "m", - "description": "Minimum elevation value within the watershed’s DEM raster.", - "is_key": false, - "is_required": false, - "theme": "", - "preferred_bin_definition": "", - "default_value": null - }, - { - "name": "dem_bins.max", - "dtype": "REAL", - "friendly_name": "Highest Elevation", - "description": "Maximum elevation value within the watershed’s DEM raster.", - "data_unit": "m", - "is_key": false, - "is_required": false, - "theme": "", - "preferred_bin_definition": "", - "default_value": null - }, - { - "name": "dem_bins.geotransform", - "dtype": "ARRAY", - "friendly_name": "Raster Geotransform", - "data_unit": "NA", - "description": "Affine transformation coefficients for converting raster coordinates to spatial coordinates.", - "is_key": false, - "is_required": false, - "theme": "", - "preferred_bin_definition": "", - "default_value": null - }, - { - "name": "dem_bins.proj", - "dtype": "TEXT", - "friendly_name": "Raster Projection (WKT)", - "data_unit": "NA", - "description": "Well-Known Text (WKT) string describing the spatial reference system of the DEM raster.", - "is_key": false, - "is_required": false, - "theme": "", - "preferred_bin_definition": "", - "default_value": null - }, - { - "name": "dem_bins.nodata", - "dtype": "REAL", - "friendly_name": "NoData Value", - "data_unit": "NA", - "description": "Value used in the DEM raster to represent missing or undefined data.", - "is_key": false, - "is_required": false, - "theme": "", - "default_value": null - }, - { - "name": "dem_bins.value_count", - "dtype": "INTEGER", - "friendly_name": "Valid 
Cell Count", - "data_unit": "count", - "description": "Number of raster cells with valid elevation values (excluding NoData)", - "is_key": false, - "is_required": false, - "theme": "", - "preferred_bin_definition": "", - "default_value": null - }, - { - "name": "dem_bins.hist_type", - "dtype": "TEXT", - "friendly_name": "Histogram Type", - "data_unit": "NA", - "description": "Method used to bin elevation values. Continuous means bins are of equal width.", - "is_key": false, - "is_required": false, - "theme": "", - "default_value": null - }, - { - "name": "dem_bins.bin_size", - "dtype": "INTEGER", - "friendly_name": "Bin Size", - "data_unit": "m", - "description": "Width of each elevation bin used in the hypsometric analysis", - "theme": "", - "default_value": null - }, - { - "name": "dem_bins.bins.bin", - "dtype": "INTEGER", - "friendly_name": "Elevation Bin Value", - "data_unit": "m", - "description": "Lower bound of the bin", - "is_key": false, - "is_required": false, - "theme": "", - "default_value": null - }, - { - "name": "dem_bins.bins.cell_count", - "dtype": "INTEGER", - "friendly_name": "Cell Count", - "data_unit": "count", - "description": "Number of raster cells falling within elevation bin", - "is_key": false, - "is_required": false, - "theme": "", - "default_value": null - }, - { - "name": "existing_veg_bins", - "dtype": "STRUCT", - "friendly_name": "Existing Vegetation Bins", - "description": "Categorical bins extracted from the LandFire Existing Vegetation Type (EVT) raster. Complexes of plant communities representing NatureServe's terrestrial Ecological Systems classification." 
- } - ] + "$schema": "https://xml.riverscapes.net/riverscapes_metadata/schema/layer_definitions.schema.json", + "tool_schema_name": "rscontext_to_athena", + "tool_schema_version": "1.0.2", + "layers": [ + { + "layer_id": "rs_context_huc10", + "layer_name": "rs_context_huc10", + "description": "", + "layer_type": "DataTable", + "columns": [ + { + "name": "huc", + "dtype": "STRING", + "friendly_name": "HUC10", + "data_unit": "NA", + "description": "HUC10 Watershed ID", + "is_key": true, + "is_required": true, + "theme": "", + "preferred_bin_definition": "", + "default_value": null + }, + { + "name": "hucName", + "dtype": "STRING", + "friendly_name": "Watershed Name", + "data_unit": "NA", + "description": "", + "is_key": false, + "is_required": false, + "theme": "", + "default_value": null + }, + { + "name": "hucStates", + "dtype": "STRING", + "friendly_name": "State(s)", + "data_unit": "", + "description": "", + "theme": "" + }, + { + "name": "hucAreaSqKm", + "dtype": "FLOAT", + "friendly_name": "Watershed Area", + "data_unit": "km^2", + "description": "", + "is_key": false, + "is_required": false, + "theme": "", + "default_value": null + }, + { + "name": "hucAreaAcres", + "dtype": "FLOAT", + "friendly_name": "Watershed Area (acres)", + "data_unit": "acre", + "description": "", + "is_key": false, + "is_required": false, + "theme": "", + "default_value": null + }, + { + "name": "flowlineLengthPerennialKm", + "dtype": "FLOAT", + "friendly_name": "Perennial Stream Length", + "data_unit": "km" + }, + { + "name": "flowlineLengthIntermittentKm", + "dtype": "FLOAT", + "friendly_name": "Intermittent Stream Length", + "data_unit": "km" + }, + { + "name": "flowlineLengthEphemeralKm", + "dtype": "FLOAT", + "friendly_name": "Ephemeral Stream Length", + "data_unit": "km" + }, + { + "name": "flowlineLengthCanalsKm", + "dtype": "FLOAT", + "friendly_name": "Canal Length", + "data_unit": "km" + }, + { + "name": "flowlineLengthAllKm", + "dtype": "FLOAT", + "friendly_name": "Total
Stream Length", + "data_unit": "km" + }, + { + "name": "flowlineFeatureCount", + "dtype": "INTEGER", + "friendly_name": "Number of Stream Segments", + "data_unit": "count" + }, + { + "name": "waterbodyAreaSqKm", + "dtype": "FLOAT", + "friendly_name": "Total Waterbody Area", + "data_unit": "km^2" + }, + { + "name": "waterbodyFeatureCount", + "dtype": "INTEGER", + "friendly_name": "Total count of Waterbodies", + "data_unit": "count" + }, + { + "name": "waterbodyLakesPondsAreaSqKm", + "dtype": "FLOAT", + "friendly_name": "Lake Area", + "data_unit": "km^2" + }, + { + "name": "waterbodyReservoirAreaSqKm", + "dtype": "FLOAT", + "friendly_name": "Reservoir Area", + "data_unit": "km^2" + }, + { + "name": "waterbodyEstuariesAreaSqKm", + "dtype": "FLOAT", + "friendly_name": "Estuaries Area", + "data_unit": "km^2" + }, + { + "name": "waterbodyPlayaAreaSqKm", + "dtype": "FLOAT", + "friendly_name": "Playa Area", + "data_unit": "km^2" + }, + { + "name": "waterbodySwampMarshAreaSqKm", + "dtype": "FLOAT", + "friendly_name": "Swamp/Marsh Area", + "data_unit": "km^2" + }, + { + "name": "waterbodyIceSnowAreaSqKm", + "dtype": "FLOAT", + "friendly_name": "Ice/Snow Area", + "data_unit": "km^2" + }, + { + "name": "demMaximum", + "dtype": "FLOAT", + "friendly_name": "DEM Max", + "data_unit": "m", + "description": "Maximum Elevation", + "theme": "Elevation" + }, + { + "name": "demMinimum", + "dtype": "FLOAT", + "friendly_name": "DEM Min", + "data_unit": "m", + "description": "Minimum Elevation", + "theme": "Elevation" + }, + { + "name": "demSum", + "dtype": "FLOAT", + "data_unit": "m", + "friendly_name": "DEM Spatial Sum of Grid Values", + "description": "Sum of individual grid cell values for elevation. 
Meaningless on its own, this is useful for calculating spatial average.", + "theme": "Elevation" + }, + { + "name": "demCount", + "dtype": "INTEGER", + "friendly_name": "Number of DEM Grid Cells", + "data_unit": "count", + "description": "Count of individual grid cells used in aggregations. Meaningless on its own, this is useful for calculating spatial average.", + "theme": "Elevation" + }, + { + "name": "demMean", + "dtype": "FLOAT", + "friendly_name": "DEM Spatial Mean", + "data_unit": "m", + "description": "Mean elevation across the geography.", + "theme": "Elevation" + }, + { + "name": "demRange", + "dtype": "FLOAT", + "friendly_name": "DEM Spatial Range", + "data_unit": "m", + "description": "Spatial variability, ie difference between Max and Min elevation across subject area.", + "theme": "Elevation" + }, + { + "name": "slopeMaximum", + "dtype": "FLOAT", + "friendly_name": "Slope Max", + "data_unit": "degree", + "description": "Maximum Slope", + "theme": "Slope" + }, + { + "name": "slopeMinimum", + "dtype": "FLOAT", + "friendly_name": "Slope Min", + "data_unit": "degree", + "description": "Minimum Slope", + "theme": "Slope" + }, + { + "name": "slopeSum", + "dtype": "FLOAT", + "data_unit": "degree", + "friendly_name": "Slope Spatial Sum of Grid Values", + "description": "Sum of individual grid cell values for slope. Meaningless on its own, this is useful for calculating spatial average.", + "theme": "Slope" + }, + { + "name": "slopeCount", + "dtype": "INTEGER", + "friendly_name": "Number of Slope Grid Cells", + "data_unit": "count", + "description": "Count of individual grid cells used in aggregations. 
Meaningless on its own, this is useful for calculating spatial average.", + "theme": "Slope" + }, + { + "name": "slopeMean", + "dtype": "FLOAT", + "friendly_name": "Slope Spatial Mean", + "data_unit": "degree", + "description": "Mean slope across the geography.", + "theme": "Slope" + }, + { + "name": "slopeRange", + "dtype": "FLOAT", + "friendly_name": "Slope Spatial Range", + "data_unit": "degree", + "description": "Spatial variability, ie difference between Max and Min slope across subject area.", + "theme": "Slope" + }, + { + "name": "precipMaximum", + "dtype": "FLOAT", + "friendly_name": "Precipitation Spatial Max", + "data_unit": "mm", + "description": "Maximum 30-year-average annual precipitation", + "theme": "PRISM Climate" + }, + { + "name": "precipMinimum", + "dtype": "FLOAT", + "friendly_name": "Precipitation Spatial Min", + "data_unit": "mm", + "description": "Minimum 30-year-average annual precipitation", + "theme": "PRISM Climate" + }, + { + "name": "precipSum", + "dtype": "FLOAT", + "data_unit": "mm", + "friendly_name": "Precipitation Spatial Sum of Grid Values", + "description": "Sum of individual 800-meter grid cell values for 30-year-average annual precipitation. Meaningless on its own, this is useful for calculating spatial average.", + "theme": "PRISM Climate" + }, + { + "name": "precipCount", + "dtype": "INTEGER", + "friendly_name": "Number of Precipitation Grid Cells", + "data_unit": "count", + "description": "Count of individual 800-meter grid cell values used in aggregations.
Meaningless on its own, this is useful for calculating spatial average.", + "theme": "PRISM Climate" + }, + { + "name": "precipMean", + "dtype": "FLOAT", + "friendly_name": "Precipitation Spatial Mean", + "data_unit": "mm", + "description": "Mean across the geography of the 30-year average annual precipitation.", + "theme": "PRISM Climate" + }, + { + "name": "precipRange", + "dtype": "FLOAT", + "friendly_name": "Precipitation Spatial Range", + "data_unit": "mm", + "description": "Spatial variability (difference between Max and Min precipitation areas) in 30-year average precipitation.", + "theme": "PRISM Climate" + }, + { + "name": "catchmentLength", + "dtype": "FLOAT", + "friendly_name": "Catchment Length", + "data_unit": "km", + "description": "", + "theme": "Morphometric" + }, + { + "name": "catchmentArea", + "dtype": "FLOAT", + "friendly_name": "Catchment Area", + "data_unit": "km^2", + "description": "", + "theme": "Morphometric" + }, + { + "name": "catchmentPerimeter", + "dtype": "FLOAT", + "friendly_name": "Catchment Perimeter", + "data_unit": "km", + "description": "", + "theme": "Morphometric" + }, + { + "name": "circularityRatio", + "dtype": "FLOAT", + "friendly_name": "Circularity Ratio", + "data_unit": "dimensionless", + "description": "Calculated as `catchment_area / bounding_circle_area` (both in same units)", + "theme": "Morphometric" + }, + { + "name": "elongationRatio", + "dtype": "FLOAT", + "friendly_name": "Elongation Ratio", + "data_unit": "dimensionless", + "description": "Calculated as `catchment_area_km2**0.5 / catchment_length_km`", + "theme": "Morphometric" + }, + { + "name": "formFactor", + "dtype": "FLOAT", + "friendly_name": "Form Factor", + "data_unit": "dimensionless", + "description": "Calculated as `catchment_area_km2 / catchment_length_km**2`", + "theme": "Morphometric" + }, + { + "name": "catchmentRelief", + "dtype": "FLOAT", + "friendly_name": "Catchment Relief", + "data_unit": "m", + "description": "The difference between highest 
and lowest elevation. Equivalent to DEM Range", + "theme": "Morphometric" + }, + { + "name": "reliefRatio", + "dtype": "FLOAT", + "friendly_name": "Relief Ratio", + "data_unit": "dimensionless", + "description": "Calculated as relief (in km) divided by catchment length (in km).", + "theme": "Morphometric" + }, + { + "name": "dem_bins.min", + "dtype": "FLOAT", + "friendly_name": "Lowest Elevation", + "data_unit": "m", + "description": "Minimum elevation value within the watershed’s DEM raster.", + "is_key": false, + "is_required": false, + "theme": "", + "preferred_bin_definition": "", + "default_value": null + }, + { + "name": "dem_bins.max", + "dtype": "FLOAT", + "friendly_name": "Highest Elevation", + "description": "Maximum elevation value within the watershed’s DEM raster.", + "data_unit": "m", + "is_key": false, + "is_required": false, + "theme": "", + "preferred_bin_definition": "", + "default_value": null + }, + { + "name": "dem_bins.geotransform", + "dtype": "STRUCTURED", + "friendly_name": "Raster Geotransform", + "data_unit": "NA", + "description": "Affine transformation coefficients for converting raster coordinates to spatial coordinates.", + "is_key": false, + "is_required": false, + "theme": "", + "preferred_bin_definition": "", + "default_value": null + }, + { + "name": "dem_bins.proj", + "dtype": "STRING", + "friendly_name": "Raster Projection (WKT)", + "data_unit": "NA", + "description": "Well-Known Text (WKT) string describing the spatial reference system of the DEM raster.", + "is_key": false, + "is_required": false, + "theme": "", + "preferred_bin_definition": "", + "default_value": null + }, + { + "name": "dem_bins.nodata", + "dtype": "FLOAT", + "friendly_name": "NoData Value", + "data_unit": "NA", + "description": "Value used in the DEM raster to represent missing or undefined data.", + "is_key": false, + "is_required": false, + "theme": "", + "default_value": null + }, + { + "name": "dem_bins.value_count", + "dtype": "INTEGER", + 
"friendly_name": "Valid Cell Count", + "data_unit": "count", + "description": "Number of raster cells with valid elevation values (excluding NoData)", + "is_key": false, + "is_required": false, + "theme": "", + "preferred_bin_definition": "", + "default_value": null + }, + { + "name": "dem_bins.hist_type", + "dtype": "STRING", + "friendly_name": "Histogram Type", + "data_unit": "NA", + "description": "Method used to bin elevation values. Continuous means bins are of equal width.", + "is_key": false, + "is_required": false, + "theme": "", + "default_value": null + }, + { + "name": "dem_bins.bin_size", + "dtype": "INTEGER", + "friendly_name": "Bin Size", + "data_unit": "m", + "description": "Width of each elevation bin used in the hypsometric analysis", + "theme": "", + "default_value": null + }, + { + "name": "dem_bins.bins.bin", + "dtype": "INTEGER", + "friendly_name": "Elevation Bin Value", + "data_unit": "m", + "description": "Lower bound of the bin", + "is_key": false, + "is_required": false, + "theme": "", + "default_value": null + }, + { + "name": "dem_bins.bins.cell_count", + "dtype": "INTEGER", + "friendly_name": "Cell Count", + "data_unit": "count", + "description": "Number of raster cells falling within elevation bin", + "is_key": false, + "is_required": false, + "theme": "", + "default_value": null + }, + { + "name": "existing_veg_bins", + "dtype": "STRUCTURED", + "friendly_name": "Existing Vegetation Bins", + "description": "Categorical bins extracted from the LandFire Existing Vegetation Type (EVT) raster. Complexes of plant communities representing NatureServe's terrestrial Ecological Systems classification." 
+ }, + { + "name": "ownership", + "dtype": "STRUCTURED", + "description": "A mapping of ownership codes to area" + }, + { + "name": "ownership[key]", + "friendly_name": "ownership_code", + "dtype": "STRING", + "data_unit": "NA", + "description": "Abbreviated ownership code" + }, + { + "name": "ownership[value]", + "friendly_name": "ownership_area", + "dtype": "FLOAT", + "data_unit": "m^2", + "description": "Area" } - ] + ] + } + ] } \ No newline at end of file diff --git a/pipelines/rsdynamics_to_athena/layer_definitions.json b/pipelines/rsdynamics_to_athena/layer_definitions.json index d0f049f..6cb93de 100644 --- a/pipelines/rsdynamics_to_athena/layer_definitions.json +++ b/pipelines/rsdynamics_to_athena/layer_definitions.json @@ -1,205 +1,205 @@ { - "$schema": "https://xml.riverscapes.net/riverscapes_metadata/schema/layer_definitions.schema.json", - "authority_name": "rsdynamics_to_athena", - "tool_schema_version": "1.0.0", - "layers": [ - { - "layer_id": "rsdynamics", - "layer_name": "RSDynamics DGOs", - "description": "Riverscapes Dynamics Project DGO attributes from VBET", - "columns": [ - { - "name": "dgo_id", - "friendly_name": "DGO ID", - "dtype": "bigint" - }, - { - "name": "level_path", - "friendly_name": "Level Path", - "dtype": "string" - }, - { - "name": "seg_distance", - "friendly_name": "Segment Distance", - "dtype": "bigint" - }, - { - "name": "fcode", - "friendly_name": "Feature Code", - "dtype": "bigint" - }, - { - "name": "low_lying_floodplain_area", - "friendly_name": "Low-Lying Floodplain Area", - "dtype": "double", - "data_unit": "m**2" - }, - { - "name": "low_lying_floodplain_prop", - "friendly_name": "Low-Lying Floodplain Proportion", - "dtype": "double", - "data_unit": "dimensionless", - "description": "Ratio of low-lying floodplain to DGO area" - }, - { - "name": "active_channel_area", - "friendly_name": "Active Channel Area", - "dtype": "double", - "data_unit": "m**2" - }, - { - "name": "active_channel_prop", - "friendly_name": "Active 
Channel Proportion", - "dtype": "double", - "data_unit": "dimensionless", - "description": "Ratio of active channel to DGO area" - }, - { - "name": "elevated_floodplain_area", - "friendly_name": "Elevated Floodplain Area", - "dtype": "double", - "data_unit": "m**2" - }, - { - "name": "elevated_floodplain_prop", - "friendly_name": "Elevated Floodplain Proportion", - "dtype": "double", - "data_unit": "dimensionless", - "description": "Ratio of elevated floodplain to DGO area" - }, - { - "name": "floodplain_area", - "friendly_name": "Floodplain Area", - "dtype": "double", - "data_unit": "m**2" - }, - { - "name": "floodplain_prop", - "friendly_name": "Floodplain Proportion", - "dtype": "double" - }, - { - "name": "centerline_length", - "friendly_name": "Centerline Length", - "dtype": "double", - "data_unit": "m" - }, - { - "name": "segment_area", - "friendly_name": "Segment Area", - "dtype": "double", - "data_unit": "m**2" - }, - { - "name": "integrated_width", - "friendly_name": "Integrated Width", - "dtype": "double", - "data_unit": "m" - }, - { - "name": "dgo_geom", - "friendly_name": "DGO Geometry", - "dtype": "binary" - }, - { - "name": "dgo_geom_bbox", - "friendly_name": "DGO Geometry Bounding Box", - "dtype": "struct" - }, - { - "name": "rd_project_id", - "friendly_name": "RSDynamics Project ID", - "dtype": "string", - "data_unit": "NA" - }, - { - "name": "rd_date_created_ts", - "friendly_name": "RSDynamics Project Created Timestamp", - "dtype": "bigint" - }, - { - "name": "rd_version", - "friendly_name": "RSDynamics Model Version", - "dtype": "string" - }, - { - "name": "rd_version_int", - "friendly_name": "RSDynamics Model Version (Int)", - "dtype": "bigint" - }, - { - "name": "huc", - "friendly_name": "HUC", - "dtype": "string" - } - ] - }, - { - "layer_id": "rsdynamics_metrics", - "layer_name": "RSDynamics Metrics", - "description": "Riverscapes Dynamics Project Metrics data for each DGO, landcover, epoch. 
Join with rsdynamics on rd_project_id + dgo_id.", - "columns": [ - { - "name": "dgo_id", - "friendly_name": "DGO ID", - "dtype": "bigint", - "data_unit": "NA" - }, - { - "name": "landcover", - "friendly_name": "Landcover", - "dtype": "string" - }, - { - "name": "epoch_length", - "friendly_name": "Epoch Length", - "dtype": "string" - }, - { - "name": "epoch_name", - "friendly_name": "Epoch Name", - "dtype": "string" - }, - { - "name": "confidence", - "friendly_name": "Confidence", - "dtype": "string" - }, - { - "name": "area", - "friendly_name": "Area", - "dtype": "double", - "data_unit": "m**2" - }, - { - "name": "areapc", - "friendly_name": "Area Percent", - "dtype": "double", - "data_unit": "%" - }, - { - "name": "width", - "friendly_name": "Width", - "dtype": "double", - "data_unit": "m" - }, - { - "name": "widthpc", - "friendly_name": "Width Percent", - "dtype": "double", - "data_unit": "%" - }, - { - "name": "huc", - "friendly_name": "HUC", - "dtype": "string" - }, - { - "name": "rd_project_id", - "friendly_name": "RSDynamics Project ID", - "dtype": "string" - } - ] + "$schema": "https://xml.riverscapes.net/riverscapes_metadata/schema/layer_definitions.schema.json", + "tool_schema_name": "rsdynamics_to_athena", + "tool_schema_version": "1.0.0", + "layers": [ + { + "layer_id": "rsdynamics", + "layer_name": "RSDynamics DGOs", + "description": "Riverscapes Dynamics Project DGO attributes from VBET", + "columns": [ + { + "name": "dgo_id", + "friendly_name": "DGO ID", + "dtype": "INTEGER" + }, + { + "name": "level_path", + "friendly_name": "Level Path", + "dtype": "STRING" + }, + { + "name": "seg_distance", + "friendly_name": "Segment Distance", + "dtype": "INTEGER" + }, + { + "name": "fcode", + "friendly_name": "Feature Code", + "dtype": "INTEGER" + }, + { + "name": "low_lying_floodplain_area", + "friendly_name": "Low-Lying Floodplain Area", + "dtype": "FLOAT", + "data_unit": "m**2" + }, + { + "name": "low_lying_floodplain_prop", + "friendly_name": "Low-Lying 
Floodplain Proportion", + "dtype": "FLOAT", + "data_unit": "dimensionless", + "description": "Ratio of low-lying floodplain to DGO area" + }, + { + "name": "active_channel_area", + "friendly_name": "Active Channel Area", + "dtype": "FLOAT", + "data_unit": "m**2" + }, + { + "name": "active_channel_prop", + "friendly_name": "Active Channel Proportion", + "dtype": "FLOAT", + "data_unit": "dimensionless", + "description": "Ratio of active channel to DGO area" + }, + { + "name": "elevated_floodplain_area", + "friendly_name": "Elevated Floodplain Area", + "dtype": "FLOAT", + "data_unit": "m**2" + }, + { + "name": "elevated_floodplain_prop", + "friendly_name": "Elevated Floodplain Proportion", + "dtype": "FLOAT", + "data_unit": "dimensionless", + "description": "Ratio of elevated floodplain to DGO area" + }, + { + "name": "floodplain_area", + "friendly_name": "Floodplain Area", + "dtype": "FLOAT", + "data_unit": "m**2" + }, + { + "name": "floodplain_prop", + "friendly_name": "Floodplain Proportion", + "dtype": "FLOAT" + }, + { + "name": "centerline_length", + "friendly_name": "Centerline Length", + "dtype": "FLOAT", + "data_unit": "m" + }, + { + "name": "segment_area", + "friendly_name": "Segment Area", + "dtype": "FLOAT", + "data_unit": "m**2" + }, + { + "name": "integrated_width", + "friendly_name": "Integrated Width", + "dtype": "FLOAT", + "data_unit": "m" + }, + { + "name": "dgo_geom", + "friendly_name": "DGO Geometry", + "dtype": "BINARY" + }, + { + "name": "dgo_geom_bbox", + "friendly_name": "DGO Geometry Bounding Box", + "dtype": "STRUCTURED" + }, + { + "name": "rd_project_id", + "friendly_name": "RSDynamics Project ID", + "dtype": "STRING", + "data_unit": "NA" + }, + { + "name": "rd_date_created_ts", + "friendly_name": "RSDynamics Project Created Timestamp", + "dtype": "INTEGER" + }, + { + "name": "rd_version", + "friendly_name": "RSDynamics Model Version", + "dtype": "STRING" + }, + { + "name": "rd_version_int", + "friendly_name": "RSDynamics Model Version 
(Int)", + "dtype": "INTEGER" + }, + { + "name": "huc", + "friendly_name": "HUC", + "dtype": "STRING" + } + ] + }, + { + "layer_id": "rsdynamics_metrics", + "layer_name": "RSDynamics Metrics", + "description": "Riverscapes Dynamics Project Metrics data for each DGO, landcover, epoch. Join with rsdynamics on rd_project_id + dgo_id.", + "columns": [ + { + "name": "dgo_id", + "friendly_name": "DGO ID", + "dtype": "INTEGER", + "data_unit": "NA" + }, + { + "name": "landcover", + "friendly_name": "Landcover", + "dtype": "STRING" + }, + { + "name": "epoch_length", + "friendly_name": "Epoch Length", + "dtype": "STRING" + }, + { + "name": "epoch_name", + "friendly_name": "Epoch Name", + "dtype": "STRING" + }, + { + "name": "confidence", + "friendly_name": "Confidence", + "dtype": "STRING" + }, + { + "name": "area", + "friendly_name": "Area", + "dtype": "FLOAT", + "data_unit": "m**2" + }, + { + "name": "areapc", + "friendly_name": "Area Percent", + "dtype": "FLOAT", + "data_unit": "%" + }, + { + "name": "width", + "friendly_name": "Width", + "dtype": "FLOAT", + "data_unit": "m" + }, + { + "name": "widthpc", + "friendly_name": "Width Percent", + "dtype": "FLOAT", + "data_unit": "%" + }, + { + "name": "huc", + "friendly_name": "HUC", + "dtype": "STRING" + }, + { + "name": "rd_project_id", + "friendly_name": "RSDynamics Project ID", + "dtype": "STRING" } - ] + ] + } + ] } \ No newline at end of file diff --git a/pydex/graphql/mutations/changeProjectOwner.graphql b/pydex/graphql/mutations/changeProjectOwner.graphql index 9fa98e9..cf37636 100644 --- a/pydex/graphql/mutations/changeProjectOwner.graphql +++ b/pydex/graphql/mutations/changeProjectOwner.graphql @@ -18,9 +18,6 @@ mutation changeProjectOwner($projectId: ID!, $owner: OwnerInput!) 
{ ownedBy { ...owner } - sponsor { - ...owner - } starred starredCount tags diff --git a/pydex/graphql/queries/getProjectFull.graphql b/pydex/graphql/queries/getProjectFull.graphql index fcdff87..5fd2fcf 100644 --- a/pydex/graphql/queries/getProjectFull.graphql +++ b/pydex/graphql/queries/getProjectFull.graphql @@ -43,9 +43,6 @@ query getProjectFull($id: ID!) { } } } - sponsor { - ...dbobjSm - } collections(offset: 0, limit: 50) { items { ...dbobjSm diff --git a/pyproject.toml b/pyproject.toml index bd802a4..d95328a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "riverscapes-metadata @ git+https://github.com/Riverscapes/RiverscapesXML.git@master#subdirectory=riverscapes_metadata", "awswrangler>=3.10.1", "pandas>=2.3.3", + "tqdm>=4.67.1", ] [project.optional-dependencies] diff --git a/scripts/geo/add_simplified_geom_pq.py b/scripts/geo/add_simplified_geom_pq.py new file mode 100644 index 0000000..5224b7c --- /dev/null +++ b/scripts/geo/add_simplified_geom_pq.py @@ -0,0 +1,100 @@ +"""Add simplified geometry column to existing geo-parquet file +1. download file from s3 +2. process it, generating new file +3. 
upload that to a different prefix in s3 +""" +import logging +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor, as_completed + +import geopandas as gpd +import boto3 +from tqdm import tqdm # trying this instead of ProgressBar, I've heard good things + +from rsxml import Logger +from rme_to_athena_parquet import upload_to_s3 + +DEFAULT_DATA_BUCKET = "riverscapes-athena" +DATA_ROOT = Path(r"F:\nardata\work\rme_extraction") + + +def download_s3_file(s3_bucket: str, s3_key: str, local_file_path: Path): + """Download a file from S3 to a local path.""" + s3 = boto3.client('s3') + local_file_path.parent.mkdir(parents=True, exist_ok=True) + s3.download_file(s3_bucket, s3_key, str(local_file_path)) + + +def list_s3_files(bucket, prefix): + """List all S3 object keys in a bucket with the given prefix.""" + s3 = boto3.client('s3') + paginator = s3.get_paginator('list_objects_v2') + files = [] + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + for obj in page.get('Contents', []): + files.append(obj['Key']) + return files + + +def process_one_file(filekey: str, local_folder_downloaded: Path, local_folder_processed: Path, s3_prefix_new: str): + """download, process to new file, then upload""" + log = Logger('Process One') + tqdm.write(f'Processing {filekey}') + filename = Path(filekey).name + local_file_path_downloaded = local_folder_downloaded / filename + local_file_path_processed = local_folder_processed / filename + s3key_new = s3_prefix_new + filename + try: + log.debug(f'Downloading to {local_file_path_downloaded}') + download_s3_file(DEFAULT_DATA_BUCKET, filekey, local_file_path_downloaded) + log.debug(f'Processing to {local_file_path_processed}') + process_pq1_to_pq2(local_file_path_downloaded, local_file_path_processed) + log.debug(f'Uploading to {s3key_new}') + upload_to_s3(local_file_path_processed, DEFAULT_DATA_BUCKET, s3key_new) + except Exception as e: + log.error(f"Failed to process {filename}: {e}") + + +def 
process_multiple(filepattern: str): + """process all files starting with filepattern (empty means all files)""" + log = Logger("Process multiple") + s3_prefix = 'data_exchange/riverscape_metrics/' + s3_prefix_new = 'data_exchange/rs_metric_engine2/' + local_folder_downloaded = DATA_ROOT / "from-s3-rsathena-data_exchange_rsmetrics" + local_folder_processed = DATA_ROOT / "to-s3-rsathena-data_exchange_rsmetrics2" + + files = list_s3_files(DEFAULT_DATA_BUCKET, s3_prefix + filepattern) + log.info(f'Found {len(files)} files matching pattern {filepattern}') + with ThreadPoolExecutor(max_workers=12) as executor: # ADJUST as needed + futures = [executor.submit(process_one_file, filekey, local_folder_downloaded, local_folder_processed, s3_prefix_new) for filekey in files] + for _ in tqdm(as_completed(futures), total=len(futures)): + pass # Optionally handle results or exceptions here + + +def process_pq1_to_pq2(inputpqpath: Path, outputpqpath: Path, tolerance: float = 11): + """take a geo-parquet file, add a simplified geometry column, save back to new geo-parquet file""" + gdf = gpd.read_parquet(inputpqpath) + + # Reproject to EPSG:5070 for simplification + gdf_proj = gdf.to_crs(epsg=5070) + + # Use simplify_coverage for topology-preserving simplification + gdf["geometry_simplified"] = gdf_proj.geometry.simplify_coverage(tolerance=tolerance) + # Reproject simplified geometry back to EPSG:4326 + gdf["geometry_simplified"] = gpd.GeoSeries(gdf["geometry_simplified"], crs=5070).to_crs(epsg=4326) + gdf = gdf.set_crs(epsg=4326) + gdf = gdf.reset_index(drop=True) + gdf.to_parquet(outputpqpath) + + +def main(): + """Main entry point""" + log = Logger('Setup') + log.setup(log_path=str(DATA_ROOT / 'add_simplified_geom.log'), log_level=logging.INFO) + log.title('Add simplified geometry') + process_multiple('rme') + log.title('Completed.') + + +if __name__ == '__main__': + main() diff --git a/scripts/rpt_rme/layer_definitions.json b/scripts/rpt_rme/layer_definitions.json index 
59fa241..2eff985 100644 --- a/scripts/rpt_rme/layer_definitions.json +++ b/scripts/rpt_rme/layer_definitions.json @@ -1,39 +1,39 @@ { - "$schema": "https://xml.riverscapes.net/riverscapes_metadata/schema/layer_definitions.schema.json", - "authority_name": "rpt_rme", - "tool_schema_version": "1.0.2", - "layers": [ + "$schema": "https://xml.riverscapes.net/riverscapes_metadata/schema/layer_definitions.schema.json", + "tool_schema_name": "rpt_rme", + "tool_schema_version": "1.0.2", + "layers": [ + { + "layer_id": "rpt_rme", + "layer_name": "rpt_rme_pq", + "description": "report view", + "layer_type": "DataTable", + "columns": [ { - "layer_id": "rpt_rme", - "layer_name": "rpt_rme_pq", - "description": "report view", - "layer_type": "DataTable", - "columns": [ - { - "name": "fcode_desc", - "dtype": "TEXT", - "friendly_name": "Flow Type", - "data_unit": "", - "description": "", - "is_key": false, - "is_required": false, - "theme": "", - "preferred_bin_definition": "", - "default_value": null - }, - { - "name": "ownership_desc", - "dtype": "TEXT", - "friendly_name": "Ownership", - "data_unit": "", - "description": "", - "is_key": false, - "is_required": false, - "theme": "", - "preferred_bin_definition": "", - "default_value": null - } - ] + "name": "fcode_desc", + "dtype": "STRING", + "friendly_name": "Flow Type", + "data_unit": "", + "description": "", + "is_key": false, + "is_required": false, + "theme": "", + "preferred_bin_definition": "", + "default_value": null + }, + { + "name": "ownership_desc", + "dtype": "STRING", + "friendly_name": "Ownership", + "data_unit": "", + "description": "", + "is_key": false, + "is_required": false, + "theme": "", + "preferred_bin_definition": "", + "default_value": null } - ] + ] + } + ] } \ No newline at end of file diff --git a/scripts/utility/compare_parquet_files.py b/scripts/utility/compare_parquet_files.py new file mode 100644 index 0000000..1345410 --- /dev/null +++ b/scripts/utility/compare_parquet_files.py @@ -0,0 +1,65 @@ 
+"""compare parquet files - ingoring column order""" +import hashlib +import pandas as pd + + +def hash_dataframe_content(df: pd.DataFrame) -> str: + """ + Generate a hash for a DataFrame, normalizing column order and handling binary columns. + - Columns are sorted alphabetically. + - Binary columns (bytes/bytearray) are converted to hex strings. + - All other columns are converted to string. + - The index is ignored in the hash. + + Args: + df (pd.DataFrame): The DataFrame to hash. + + Returns: + str: The MD5 hash of the DataFrame content. + """ + df_copy = df.reindex(sorted(df.columns), axis=1).copy() + for col in df_copy.columns: + if df_copy[col].dtype == 'object': + # Try to detect binary columns by checking the first non-null value + sample = df_copy[col].dropna().iloc[0] if not df_copy[col].dropna().empty else None + if isinstance(sample, (bytes, bytearray)): + # Convert binary columns to hex string + df_copy[col] = df_copy[col].apply(lambda x: x.hex() if isinstance(x, (bytes, bytearray)) else str(x)) + else: + df_copy[col] = df_copy[col].astype(str) + else: + df_copy[col] = df_copy[col].astype(str) + return hashlib.md5(pd.util.hash_pandas_object(df_copy, index=False).values).hexdigest() + + +def compare_parquet_files(file1: str, file2: str) -> None: + """ + Compare two Parquet files for content equality, ignoring column order. + Prints the hash of each file and whether their content matches. + + Args: + file1 (str): Path to the first Parquet file. + file2 (str): Path to the second Parquet file. 
+ """ + df1 = pd.read_parquet(file1) + df2 = pd.read_parquet(file2) + hash1 = hash_dataframe_content(df1) + hash2 = hash_dataframe_content(df2) + print(f"File 1: {file1}") + print(f"File 2: {file2}") + print("Hash 1:", hash1) + print("Hash 2:", hash2) + print("Files have the same content (ignoring column order):", hash1 == hash2) + + +def main(): + """main + Replace with your file paths + """ + file1 = r"C:\nardata\pydataroot\rme-athena\downloads\0713001203\Jan15_rme_0713001203.parquet" + file2 = r"C:\nardata\pydataroot\rme-athena\downloads\0713001203\new_rme_0713001203.parquet" + compare_parquet_files(file1, file2) + + +if __name__ == "__main__": + main() diff --git a/uv.lock b/uv.lock index 0e77427..da02e85 100644 --- a/uv.lock +++ b/uv.lock @@ -211,6 +211,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" }, ] +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + [[package]] name = "contourpy" version = "1.3.3" @@ -940,6 +949,7 @@ dependencies = [ { name = "setuptools" }, { name = "six" }, { name = "termcolor" }, + { name = "tqdm" }, { name = "urllib3" }, ] @@ -973,6 
+983,7 @@ requires-dist = [ { name = "shapely", marker = "extra == 'geo'", specifier = ">=2.1.1" }, { name = "six", specifier = "==1.17.0" }, { name = "termcolor", specifier = "==2.5.0" }, + { name = "tqdm", specifier = ">=4.67.1" }, { name = "urllib3", specifier = ">=2.3" }, ] provides-extras = ["geo"] @@ -1354,6 +1365,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/be/df630c387a0a054815d60be6a97eb4e8f17385d5d6fe660e1c02750062b4/termcolor-2.5.0-py3-none-any.whl", hash = "sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8", size = 7755, upload-time = "2024-10-06T19:50:02.097Z" }, ] +[[package]] +name = "tqdm" +version = "4.67.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0"