diff --git a/databricks.yml b/databricks.yml
index d960794e..11346fb2 100644
--- a/databricks.yml
+++ b/databricks.yml
@@ -24,17 +24,13 @@ targets:
     mode: development
     default: true
     workspace:
-      host: https://adb-4243551358552236.16.azuredatabricks.net
+      host: https://adb-6450443583208388.8.azuredatabricks.net
 
   prod:
     mode: production
-    run_as:
-      service_principal_name: b61ef435-b155-42c7-8c9e-2a2442e280cd
     workspace:
-      host: https://adb-4243551358552236.16.azuredatabricks.net
-      root_path: /Workspace/Jobs/nhp_devs/${bundle.name}/${bundle.target}
+      host: https://adb-6450443583208388.8.azuredatabricks.net
+      root_path: /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target}
     permissions:
-      - group_name: nhp_devs
-        level: CAN_MANAGE
-      - service_principal_name: b61ef435-b155-42c7-8c9e-2a2442e280cd
+      - group_name: UDAL - Databricks - NewHospitalProgramme - PROD
         level: CAN_MANAGE
diff --git a/databricks_workflows/nhp_data-ecds.yaml b/databricks_workflows/nhp_data-ecds.yaml
index 8d8dedf8..862fa3b2 100644
--- a/databricks_workflows/nhp_data-ecds.yaml
+++ b/databricks_workflows/nhp_data-ecds.yaml
@@ -2,14 +2,6 @@ resources:
   jobs:
     Generate_NHP_Data_AAE_ECDS:
       name: Generate NHP Data (AAE/ECDS)
-      webhook_notifications:
-        on_success:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-        on_failure:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-      notification_settings:
-        no_alert_for_skipped_runs: true
-        no_alert_for_canceled_runs: true
       tasks:
         - task_key: run_ecds
           condition_task:
@@ -23,7 +15,7 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: raw_data-aae
-          job_cluster_key: generate_nhp_ecds
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: nhp-raw_data-ecds
@@ -32,7 +24,7 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: raw_data-ecds
-          job_cluster_key: generate_nhp_ecds
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: nhp-aggregated_data-ecds
@@ -41,7 +33,7 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: aggregated_data-ecds
-          job_cluster_key: generate_nhp_ecds
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: nhp-default-ecds
@@ -50,20 +42,33 @@ resources:
           python_wheel_task:
             package_name: nhp_data
            entry_point: default-ecds
-          job_cluster_key: generate_nhp_ecds
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
       job_clusters:
-        - job_cluster_key: generate_nhp_ecds
+        - job_cluster_key: nhp_data
           new_cluster:
-            cluster_name: ""
-            spark_version: 16.4.x-scala2.13
-            instance_pool_id: 0129-130615-maw351-pool-pss8mvfy
-            data_security_mode: SINGLE_USER
-            runtime_engine: PHOTON
-            autoscale:
-              min_workers: 2
-              max_workers: 8
+            spark_version: 17.3.x-scala2.13
+            spark_conf:
+              spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite: "true"
+              spark.databricks.delta.properties.defaults.autoOptimize.autoCompact: "true"
+              spark.databricks.sql.initial.catalog.namespace: udal_lake_mart
+            azure_attributes:
+              availability: SPOT_WITH_FALLBACK_AZURE
+              spot_bid_max_price: 100
+            node_type_id: Standard_E16_v3
+            custom_tags:
+              Application: UDAL
+              Workload_Type: Job
+              Workload_Size: Small
+            spark_env_vars:
+              PYSPARK_PYTHON: /databricks/python3/bin/python3
+            policy_id: 000816B2986C0B29
+            data_security_mode: USER_ISOLATION
+            runtime_engine: STANDARD
+            kind: CLASSIC_PREVIEW
+            is_single_node: false
+            num_workers: 1
       tags:
         group: nhp_data
         env: ${bundle.target}
diff --git a/databricks_workflows/nhp_data-extract_nhp_for_containers.yaml b/databricks_workflows/nhp_data-extract_nhp_for_containers.yaml
index 8de84573..2b674242 100644
--- a/databricks_workflows/nhp_data-extract_nhp_for_containers.yaml
+++ b/databricks_workflows/nhp_data-extract_nhp_for_containers.yaml
@@ -2,11 +2,6 @@ resources:
   jobs:
     Extract_NHP_for_containers:
       name: Extract NHP for containers
-      webhook_notifications:
-        on_success:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-        on_failure:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
       tasks:
         - task_key: run_extract_apc_data
           for_each_task:
@@ -19,7 +14,7 @@ resources:
                 parameters:
                   - "{{job.parameters.data_version}}"
                   - "{{input}}"
-              job_cluster_key: run_nhp_extracts_cluster
+              job_cluster_key: nhp_data
               libraries:
                 - whl: ../dist/*.whl
         - task_key: run_extract_opa_data
@@ -35,7 +30,7 @@ resources:
                 parameters:
                   - "{{job.parameters.data_version}}"
                   - "{{input}}"
-              job_cluster_key: run_nhp_extracts_cluster
+              job_cluster_key: nhp_data
               libraries:
                 - whl: ../dist/*.whl
         - task_key: run_extract_ecds_data
@@ -51,7 +46,7 @@ resources:
                 parameters:
                   - "{{job.parameters.data_version}}"
                   - "{{input}}"
-              job_cluster_key: run_nhp_extracts_cluster
+              job_cluster_key: nhp_data
               libraries:
                 - whl: ../dist/*.whl
         - task_key: run_extract_demographic_factors_data
@@ -68,7 +63,7 @@ resources:
                   - "{{job.parameters.data_version}}"
                   - "{{input}}"
                   - "{{job.parameters.projection_year}}"
-              job_cluster_key: run_nhp_extracts_cluster
+              job_cluster_key: nhp_data
               libraries:
                 - whl: ../dist/*.whl
         - task_key: run_extract_birth_factors_data
@@ -85,7 +80,7 @@ resources:
                   - "{{job.parameters.data_version}}"
                   - "{{input}}"
                   - "{{job.parameters.projection_year}}"
-              job_cluster_key: run_nhp_extracts_cluster
+              job_cluster_key: nhp_data
               libraries:
                 - whl: ../dist/*.whl
         - task_key: run_extract_inequalities_data
@@ -101,7 +96,7 @@ resources:
                 parameters:
                   - "{{job.parameters.data_version}}"
                   - "{{input}}"
-              job_cluster_key: run_nhp_extracts_cluster
+              job_cluster_key: nhp_data
               libraries:
                 - whl: ../dist/*.whl
         - task_key: generate_provider_gams
@@ -112,7 +107,7 @@ resources:
             entry_point: model_data-health_status_adjustment-generate_provider_gams
             parameters:
               - "{{job.parameters.data_version}}"
-          job_cluster_key: run_nhp_extracts_cluster
+          job_cluster_key: nhp_data
           libraries:
             - pypi:
                 package: pygam==0.9.1
@@ -125,7 +120,7 @@ resources:
            entry_point: model_data-health_status_adjustment-generate_icb_gams
            parameters:
              - "{{job.parameters.data_version}}"
-          job_cluster_key: run_nhp_extracts_cluster
+          job_cluster_key: nhp_data
           libraries:
             - pypi:
                 package: pygam==0.9.1
@@ -138,7 +133,7 @@ resources:
            entry_point: model_data-health_status_adjustment-generate_national_gams
            parameters:
              - "{{job.parameters.data_version}}"
-          job_cluster_key: run_nhp_extracts_cluster
+          job_cluster_key: nhp_data
           libraries:
             - pypi:
                 package: pygam==0.9.1
@@ -156,7 +151,7 @@ resources:
                 parameters:
                   - "{{input}}"
                   - "20251001"
-              job_cluster_key: run_nhp_extracts_cluster
+              job_cluster_key: nhp_data
               libraries:
                 - whl: ../dist/*.whl
         - task_key: clean_up
@@ -167,32 +162,42 @@ resources:
             entry_point: model_data-clean_up
             parameters:
               - "{{job.parameters.data_version}}"
-          job_cluster_key: run_nhp_extracts_cluster
+          job_cluster_key: nhp_data
+          libraries:
+            - whl: ../dist/*.whl
+        - task_key: move_data
+          depends_on:
+            - task_key: clean_up
+          python_wheel_task:
+            package_name: nhp_data
+            entry_point: extract_data
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
       job_clusters:
-        - job_cluster_key: run_nhp_extracts_cluster
+        - job_cluster_key: nhp_data
           new_cluster:
-            cluster_name: ""
-            spark_version: 16.4.x-scala2.13
+            spark_version: 17.3.x-scala2.13
             spark_conf:
-              spark.master: local[*, 4]
-              spark.databricks.cluster.profile: singleNode
+              spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite: "true"
+              spark.databricks.delta.properties.defaults.autoOptimize.autoCompact: "true"
+              spark.databricks.sql.initial.catalog.namespace: udal_lake_mart
             azure_attributes:
-              first_on_demand: 1
-              availability: ON_DEMAND_AZURE
-              spot_bid_max_price: -1
-            node_type_id: Standard_E8ads_v5
-            driver_node_type_id: Standard_E8ads_v5
+              availability: SPOT_WITH_FALLBACK_AZURE
+              spot_bid_max_price: 100
+            node_type_id: Standard_E16_v3
             custom_tags:
-              ResourceClass: SingleNode
-              project: nhp
+              Application: UDAL
+              Workload_Type: Job
+              Workload_Size: Small
             spark_env_vars:
              PYSPARK_PYTHON: /databricks/python3/bin/python3
-            enable_elastic_disk: true
-            data_security_mode: SINGLE_USER
+            policy_id: 000816B2986C0B29
+            data_security_mode: USER_ISOLATION
             runtime_engine: STANDARD
-            num_workers: 0
+            kind: CLASSIC_PREVIEW
+            is_single_node: false
+            num_workers: 1
       tags:
         group: nhp_data_model
         env: ${bundle.target}
diff --git a/databricks_workflows/nhp_data-inpatients.yaml b/databricks_workflows/nhp_data-inpatients.yaml
index 3e2d5f32..c21be391 100644
--- a/databricks_workflows/nhp_data-inpatients.yaml
+++ b/databricks_workflows/nhp_data-inpatients.yaml
@@ -2,14 +2,6 @@ resources:
   jobs:
     Generate_NHP_Data_IP:
       name: Generate NHP Data (IP)
-      webhook_notifications:
-        on_success:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-        on_failure:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-      notification_settings:
-        no_alert_for_skipped_runs: true
-        no_alert_for_canceled_runs: true
       tasks:
         - task_key: run_ip
           condition_task:
@@ -23,7 +15,7 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: raw_data-inpatients
-          job_cluster_key: generate_nhp_apc
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: nhp-raw_data-apc_mitigators
@@ -32,7 +24,7 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: raw_data-inpatients_mitigators
-          job_cluster_key: generate_nhp_apc
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: nhp-default-apc
@@ -41,20 +33,33 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: default-apc
-          job_cluster_key: generate_nhp_apc
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
       job_clusters:
-        - job_cluster_key: generate_nhp_apc
+        - job_cluster_key: nhp_data
           new_cluster:
-            cluster_name: ""
-            spark_version: 16.4.x-scala2.13
-            instance_pool_id: 0129-130615-maw351-pool-pss8mvfy
-            data_security_mode: SINGLE_USER
-            runtime_engine: PHOTON
-            autoscale:
-              min_workers: 2
-              max_workers: 8
+            spark_version: 17.3.x-scala2.13
+            spark_conf:
+              spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite: "true"
+              spark.databricks.delta.properties.defaults.autoOptimize.autoCompact: "true"
+              spark.databricks.sql.initial.catalog.namespace: udal_lake_mart
+            azure_attributes:
+              availability: SPOT_WITH_FALLBACK_AZURE
+              spot_bid_max_price: 100
+            node_type_id: Standard_E16_v3
+            custom_tags:
+              Application: UDAL
+              Workload_Type: Job
+              Workload_Size: Small
+            spark_env_vars:
+              PYSPARK_PYTHON: /databricks/python3/bin/python3
+            policy_id: 000816B2986C0B29
+            data_security_mode: USER_ISOLATION
+            runtime_engine: STANDARD
+            kind: CLASSIC_PREVIEW
+            is_single_node: false
+            num_workers: 1
       tags:
         group: nhp_data
         env: ${bundle.target}
diff --git a/databricks_workflows/nhp_data-inputs-lad23cd.yaml b/databricks_workflows/nhp_data-inputs-lad23cd.yaml
index be1359a8..422101c2 100644
--- a/databricks_workflows/nhp_data-inputs-lad23cd.yaml
+++ b/databricks_workflows/nhp_data-inputs-lad23cd.yaml
@@ -2,11 +2,6 @@ resources:
   jobs:
     Generate_NHP_Data_Inputs_lad23cd:
       name: "Generate NHP Data (Inputs: Local Authorities)"
-      webhook_notifications:
-        on_success:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-        on_failure:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
       tasks:
         - task_key: run_inputs_data
           condition_task:
@@ -22,7 +17,7 @@ resources:
             entry_point: inputs_data-age_sex
             parameters:
               - lad23cd
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_baseline
@@ -33,7 +28,7 @@ resources:
            entry_point: inputs_data-baseline
            parameters:
              - lad23cd
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_diagnoses
@@ -44,7 +39,7 @@ resources:
            entry_point: inputs_data-diagnoses
            parameters:
              - lad23cd
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_procedures
@@ -55,7 +50,7 @@ resources:
            entry_point: inputs_data-procedures
            parameters:
              - lad23cd
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_rates
@@ -66,22 +61,33 @@ resources:
            entry_point: inputs_data-rates
            parameters:
              - lad23cd
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
       job_clusters:
-        - job_cluster_key: nhp_inputs_data_extract
+        - job_cluster_key: nhp_data
           new_cluster:
-            cluster_name: ""
-            spark_version: 16.4.x-scala2.13
+            spark_version: 17.3.x-scala2.13
+            spark_conf:
+              spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite: "true"
+              spark.databricks.delta.properties.defaults.autoOptimize.autoCompact: "true"
+              spark.databricks.sql.initial.catalog.namespace: udal_lake_mart
+            azure_attributes:
+              availability: SPOT_WITH_FALLBACK_AZURE
+              spot_bid_max_price: 100
+            node_type_id: Standard_E16_v3
+            custom_tags:
+              Application: UDAL
+              Workload_Type: Job
+              Workload_Size: Small
             spark_env_vars:
               PYSPARK_PYTHON: /databricks/python3/bin/python3
-            instance_pool_id: 0129-130615-maw351-pool-pss8mvfy
-            data_security_mode: SINGLE_USER
-            runtime_engine: PHOTON
-            autoscale:
-              min_workers: 2
-              max_workers: 8
+            policy_id: 000816B2986C0B29
+            data_security_mode: USER_ISOLATION
+            runtime_engine: STANDARD
+            kind: CLASSIC_PREVIEW
+            is_single_node: false
+            num_workers: 1
       tags:
         group: nhp_data
         env: ${bundle.target}
diff --git a/databricks_workflows/nhp_data-inputs-provider.yaml b/databricks_workflows/nhp_data-inputs-provider.yaml
index 4165f32b..0ad745cb 100644
--- a/databricks_workflows/nhp_data-inputs-provider.yaml
+++ b/databricks_workflows/nhp_data-inputs-provider.yaml
@@ -2,11 +2,6 @@ resources:
   jobs:
     Generate_NHP_Data_Inputs_provider:
      name: "Generate NHP Data (Inputs: Provider)"
-      webhook_notifications:
-        on_success:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-        on_failure:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
       tasks:
         - task_key: run_inputs_data
           condition_task:
@@ -22,7 +17,7 @@ resources:
            entry_point: inputs_data-age_sex
            parameters:
              - provider
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_baseline
@@ -33,7 +28,7 @@ resources:
            entry_point: inputs_data-baseline
            parameters:
              - provider
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_diagnoses
@@ -44,7 +39,7 @@ resources:
            entry_point: inputs_data-diagnoses
            parameters:
              - provider
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_procedures
@@ -55,7 +50,7 @@ resources:
            entry_point: inputs_data-procedures
            parameters:
              - provider
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_rates
@@ -66,7 +61,7 @@ resources:
            entry_point: inputs_data-rates
            parameters:
              - provider
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_expat_repat
@@ -77,7 +72,7 @@ resources:
            entry_point: inputs_data-expat_repat
            parameters:
              - provider
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: extract_inequalities
@@ -88,22 +83,33 @@ resources:
            entry_point: inputs_data-inequalities
            parameters:
              - provider
-          job_cluster_key: nhp_inputs_data_extract
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
       job_clusters:
-        - job_cluster_key: nhp_inputs_data_extract
+        - job_cluster_key: nhp_data
           new_cluster:
-            cluster_name: ""
-            spark_version: 16.4.x-scala2.13
+            spark_version: 17.3.x-scala2.13
+            spark_conf:
+              spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite: "true"
+              spark.databricks.delta.properties.defaults.autoOptimize.autoCompact: "true"
+              spark.databricks.sql.initial.catalog.namespace: udal_lake_mart
+            azure_attributes:
+              availability: SPOT_WITH_FALLBACK_AZURE
+              spot_bid_max_price: 100
+            node_type_id: Standard_E16_v3
+            custom_tags:
+              Application: UDAL
+              Workload_Type: Job
+              Workload_Size: Small
             spark_env_vars:
               PYSPARK_PYTHON: /databricks/python3/bin/python3
-            instance_pool_id: 0129-130615-maw351-pool-pss8mvfy
-            data_security_mode: SINGLE_USER
-            runtime_engine: PHOTON
-            autoscale:
-              min_workers: 2
-              max_workers: 8
+            policy_id: 000816B2986C0B29
+            data_security_mode: USER_ISOLATION
+            runtime_engine: STANDARD
+            kind: CLASSIC_PREVIEW
+            is_single_node: false
+            num_workers: 1
       tags:
         group: nhp_data
         env: ${bundle.target}
diff --git a/databricks_workflows/nhp_data-outpatients.yaml b/databricks_workflows/nhp_data-outpatients.yaml
index f93c1aab..7c4b38ed 100644
--- a/databricks_workflows/nhp_data-outpatients.yaml
+++ b/databricks_workflows/nhp_data-outpatients.yaml
@@ -2,14 +2,6 @@ resources:
   jobs:
     Generate_NHP_Data_OP:
       name: Generate NHP Data (OP)
-      webhook_notifications:
-        on_success:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-        on_failure:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-      notification_settings:
-        no_alert_for_skipped_runs: true
-        no_alert_for_canceled_runs: true
       tasks:
         - task_key: run_op
           condition_task:
@@ -23,7 +15,7 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: raw_data-outpatients
-          job_cluster_key: generate_nhp_opa
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: nhp-raw_data-opa_mitigators
@@ -32,7 +24,7 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: raw_data-outpatients_mitigators
-          job_cluster_key: generate_nhp_opa
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: nhp-aggregated_data-opa
@@ -41,7 +33,7 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: aggregated_data-outpatients
-          job_cluster_key: generate_nhp_opa
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: nhp-default-opa
@@ -50,20 +42,33 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: default-opa
-          job_cluster_key: generate_nhp_opa
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
       job_clusters:
-        - job_cluster_key: generate_nhp_opa
+        - job_cluster_key: nhp_data
           new_cluster:
-            cluster_name: ""
-            spark_version: 16.4.x-scala2.13
-            instance_pool_id: 0129-130615-maw351-pool-pss8mvfy
-            data_security_mode: SINGLE_USER
-            runtime_engine: PHOTON
-            autoscale:
-              min_workers: 2
-              max_workers: 8
+            spark_version: 17.3.x-scala2.13
+            spark_conf:
+              spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite: "true"
+              spark.databricks.delta.properties.defaults.autoOptimize.autoCompact: "true"
+              spark.databricks.sql.initial.catalog.namespace: udal_lake_mart
+            azure_attributes:
+              availability: SPOT_WITH_FALLBACK_AZURE
+              spot_bid_max_price: 100
+            node_type_id: Standard_E16_v3
+            custom_tags:
+              Application: UDAL
+              Workload_Type: Job
+              Workload_Size: Small
+            spark_env_vars:
+              PYSPARK_PYTHON: /databricks/python3/bin/python3
+            policy_id: 000816B2986C0B29
+            data_security_mode: USER_ISOLATION
+            runtime_engine: STANDARD
+            kind: CLASSIC_PREVIEW
+            is_single_node: false
+            num_workers: 1
       tags:
         group: nhp_data
         env: ${bundle.target}
diff --git a/databricks_workflows/nhp_data-population_projections.yaml b/databricks_workflows/nhp_data-population_projections.yaml
index 38f17364..6b62796a 100644
--- a/databricks_workflows/nhp_data-population_projections.yaml
+++ b/databricks_workflows/nhp_data-population_projections.yaml
@@ -2,14 +2,6 @@ resources:
   jobs:
     Generate_NHP_Data_Population_Data:
       name: Generate NHP Data (Population Data)
-      webhook_notifications:
-        on_success:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-        on_failure:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-      notification_settings:
-        no_alert_for_skipped_runs: true
-        no_alert_for_canceled_runs: true
       tasks:
         - task_key: run_population_data
           condition_task:
@@ -25,7 +17,7 @@ resources:
             entry_point: population_projections-get_ons_files_2022
             parameters:
               - "{{job.parameters.projection_year}}"
-          job_cluster_key: generate_nhp_population
+          job_cluster_key: nhp_data
           libraries:
             - pypi:
                 package: beautifulsoup4
@@ -45,7 +37,7 @@ resources:
                 parameters:
                   - "{{job.parameters.projection_year}}"
                   - "{{input}}"
-              job_cluster_key: generate_nhp_population
+              job_cluster_key: nhp_data
               libraries:
                 - whl: ../dist/*.whl
         - task_key: national_population_projections
@@ -61,7 +53,7 @@ resources:
                 parameters:
                   - "{{job.parameters.projection_year}}"
                   - "{{input}}"
-              job_cluster_key: generate_nhp_population
+              job_cluster_key: nhp_data
               libraries:
                 - pypi:
                     package: xlrd
@@ -69,20 +61,29 @@ resources:
                     package: openpyxl
                 - whl: ../dist/*.whl
       job_clusters:
-        - job_cluster_key: generate_nhp_population
+        - job_cluster_key: nhp_data
           new_cluster:
-            cluster_name: ""
-            spark_version: 16.4.x-scala2.13
+            spark_version: 17.3.x-scala2.13
             spark_conf:
-              spark.master: local[*, 4]
-              spark.databricks.cluster.profile: singleNode
+              spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite: "true"
+              spark.databricks.delta.properties.defaults.autoOptimize.autoCompact: "true"
+              spark.databricks.sql.initial.catalog.namespace: udal_lake_mart
+            azure_attributes:
+              availability: SPOT_WITH_FALLBACK_AZURE
+              spot_bid_max_price: 100
+            node_type_id: Standard_DS3_v2
             custom_tags:
-              ResourceClass: SingleNode
-            instance_pool_id: 0129-130615-maw351-pool-pss8mvfy
-            driver_instance_pool_id: 0129-130615-maw351-pool-pss8mvfy
-            data_security_mode: SINGLE_USER
+              Application: UDAL
+              Workload_Type: Job
+              Workload_Size: Small
+            spark_env_vars:
+              PYSPARK_PYTHON: /databricks/python3/bin/python3
+            policy_id: 000816B2986C0B29
+            data_security_mode: USER_ISOLATION
             runtime_engine: STANDARD
-            num_workers: 0
+            kind: CLASSIC_PREVIEW
+            is_single_node: false
+            num_workers: 1
       tags:
         group: nhp_data
         env: ${bundle.target}
diff --git a/databricks_workflows/nhp_data-reference_data.yaml b/databricks_workflows/nhp_data-reference_data.yaml
index a2411212..ba1786cb 100644
--- a/databricks_workflows/nhp_data-reference_data.yaml
+++ b/databricks_workflows/nhp_data-reference_data.yaml
@@ -2,14 +2,6 @@ resources:
   jobs:
     Generate_NHP_Data_Reference_Data:
       name: Generate NHP Data (Reference Data)
-      webhook_notifications:
-        on_success:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-        on_failure:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-      notification_settings:
-        no_alert_for_skipped_runs: true
-        no_alert_for_canceled_runs: true
       tasks:
         - task_key: run_reference_data
           condition_task:
@@ -23,16 +15,34 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: reference-ods_trusts
-          job_cluster_key: generate_nhp_reference
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
-        - task_key: provider_main_icb
+        - task_key: lsoa_lookups
           depends_on:
             - task_key: ods_trusts
+          python_wheel_task:
+            package_name: nhp_data
+            entry_point: reference-lsoa_lookups
+          job_cluster_key: nhp_data
+          libraries:
+            - whl: ../dist/*.whl
+        - task_key: population_by_lsoa21
+          depends_on:
+            - task_key: lsoa_lookups
+          python_wheel_task:
+            package_name: nhp_data
+            entry_point: reference-population_by_lsoa21
+          job_cluster_key: nhp_data
+          libraries:
+            - whl: ../dist/*.whl
+        - task_key: provider_main_icb
+          depends_on:
+            - task_key: population_by_lsoa21
           python_wheel_task:
             package_name: nhp_data
             entry_point: reference-provider_main_icb
-          job_cluster_key: generate_nhp_reference
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: population_by_imd_decile
@@ -43,7 +53,7 @@ resources:
            entry_point: reference-population_by_imd_decile
            parameters:
              - "{{job.parameters.population_by_imd_decile_base_year}}"
-          job_cluster_key: generate_nhp_reference
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
         - task_key: create_provider_catchments
@@ -52,53 +62,52 @@ resources:
           python_wheel_task:
             package_name: nhp_data
             entry_point: reference-provider_catchments
-          job_cluster_key: generate_nhp_reference
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
-        - task_key: create_day_procedure_code_list
+        - task_key: create_icb_catchments
           depends_on:
             - task_key: create_provider_catchments
           python_wheel_task:
             package_name: nhp_data
-            entry_point: reference-day_procedures
-            parameters:
-              - "{{job.parameters.day_procedures_base_year}}"
-          job_cluster_key: generate_nhp_reference
-
-        - task_key: lsoa_lookups
-          depends_on:
-            - task_key: create_day_procedure_code_list
-          python_wheel_task:
-            package_name: nhp_data
-            entry_point: reference-lsoa_lookups
-          job_cluster_key: generate_nhp_reference
+            entry_point: reference-icb_catchments
+          job_cluster_key: nhp_data
           libraries:
             - whl: ../dist/*.whl
-        - task_key: population_by_lsoa21
+        - task_key: create_day_procedure_code_list
           depends_on:
-            - task_key: lsoa_lookups
+            - task_key: create_icb_catchments
           python_wheel_task:
             package_name: nhp_data
-            entry_point: reference-population_by_lsoa21
-          job_cluster_key: generate_nhp_reference
-          libraries:
-            - whl: ../dist/*.whl
-
+            entry_point: reference-day_procedures
+            parameters:
+              - "{{job.parameters.day_procedures_base_year}}"
+          job_cluster_key: nhp_data
       job_clusters:
-        - job_cluster_key: generate_nhp_reference
+        - job_cluster_key: nhp_data
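+          # every task in this job now shares this single policy-governed cluster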
           new_cluster:
-            cluster_name: ""
-            spark_version: 16.4.x-scala2.13
+            spark_version: 17.3.x-scala2.13
             spark_conf:
-              spark.master: local[*, 4]
-              spark.databricks.cluster.profile: singleNode
+              spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite: "true"
+              spark.databricks.delta.properties.defaults.autoOptimize.autoCompact: "true"
+              spark.databricks.sql.initial.catalog.namespace: udal_lake_mart
+            azure_attributes:
+              availability: SPOT_WITH_FALLBACK_AZURE
+              spot_bid_max_price: 100
+            node_type_id: Standard_E16_v3
             custom_tags:
-              ResourceClass: SingleNode
-            instance_pool_id: 0129-130615-maw351-pool-pss8mvfy
-            driver_instance_pool_id: 0129-130615-maw351-pool-pss8mvfy
-            data_security_mode: SINGLE_USER
+              Application: UDAL
+              Workload_Type: Job
+              Workload_Size: Small
+            spark_env_vars:
+              PYSPARK_PYTHON: /databricks/python3/bin/python3
+            policy_id: 000816B2986C0B29
+            data_security_mode: USER_ISOLATION
             runtime_engine: STANDARD
-            num_workers: 0
+            kind: CLASSIC_PREVIEW
+            is_single_node: false
+            num_workers: 1
       tags:
         group: nhp_data
         env: ${bundle.target}
diff --git a/databricks_workflows/nhp_data.yaml b/databricks_workflows/nhp_data.yaml
index 529c7a42..8cad1fd4 100644
--- a/databricks_workflows/nhp_data.yaml
+++ b/databricks_workflows/nhp_data.yaml
@@ -2,14 +2,6 @@ resources:
   jobs:
     Generate_NHP_Data:
       name: Generate NHP Data
-      webhook_notifications:
-        on_success:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-        on_failure:
-          - id: 11be34cb-f1f0-42ca-99d8-e7b3e75e20ca
-      notification_settings:
-        no_alert_for_skipped_runs: true
-        no_alert_for_canceled_runs: true
       tasks:
         - task_key: generate_nhp_reference_data
           run_job_task:
diff --git a/pyproject.toml b/pyproject.toml
index ac5606bb..6515dc1a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,6 +5,7 @@ description = "Add your description here"
 readme = "readme.md"
 requires-python = ">=3.12"
 dependencies = [
+    "azure-storage-blob>=12.28.0",
     "beautifulsoup4>=4.13.4",
     "databricks-connect>=16.4",
     "mlflow>=3.3.2",
@@ -84,4 +85,6 @@ reference-population_by_imd_decile = "nhp.data.reference.population_by_imd_decil
 reference-lsoa_lookups = "nhp.data.reference.lsoa_lookups:main"
 reference-population_by_lsoa21 = "nhp.data.reference.population_by_lsoa21:main"
 reference-provider_catchments = "nhp.data.reference.provider_catchments:main"
+reference-icb_catchments = "nhp.data.reference.icb_catchments:main"
+extract_data = "nhp.data.extract_data:main"
\ No newline at end of file
diff --git a/src/nhp/data/default/apc.py b/src/nhp/data/default/apc.py
index 816c3157..ca6d9654 100644
--- a/src/nhp/data/default/apc.py
+++ b/src/nhp/data/default/apc.py
@@ -1,17 +1,19 @@
 from pyspark.sql import SparkSession
 
+from nhp.data.table_names import table_names
+
 
 def create(spark: SparkSession) -> None:
     spark.sql(
-        """
-        CREATE OR REPLACE VIEW nhp.default.apc
+        f"""
+        CREATE OR REPLACE VIEW {table_names.default_apc}
         AS
         SELECT
             *
-        FROM nhp.raw_data.apc a
+        FROM {table_names.raw_data_apc} a
         WHERE EXISTS (
             SELECT 1
-            FROM nhp.reference.ods_trusts
+            FROM {table_names.reference_ods_trusts}
             WHERE a.provider = org_to
             AND org_type LIKE 'ACUTE%'
         )
@@ -19,11 +21,11 @@ def create(spark: SparkSession) -> None:
     )
 
     spark.sql(
-        """
-        CREATE OR REPLACE VIEW nhp.default.apc_mitigators
+        f"""
+        CREATE OR REPLACE VIEW {table_names.default_apc_mitigators}
         AS
         SELECT
             *
-        FROM nhp.raw_data.apc_mitigators
+        FROM {table_names.raw_data_apc_mitigators}
         """
     )
diff --git a/src/nhp/data/default/ecds.py b/src/nhp/data/default/ecds.py
index 3c27e4c4..544113df 100644
--- a/src/nhp/data/default/ecds.py
+++ b/src/nhp/data/default/ecds.py
@@ -1,17 +1,19 @@
 from pyspark.sql import SparkSession
 
+from nhp.data.table_names import table_names
+
 
 def create(spark: SparkSession) -> None:
     spark.sql(
-        """
-        CREATE OR REPLACE VIEW nhp.default.ecds
+        f"""
+        CREATE OR REPLACE VIEW {table_names.default_ecds}
         AS
         SELECT
             *
-        FROM nhp.aggregated_data.ecds a
+        FROM {table_names.aggregated_data_ecds} a
         WHERE EXISTS (
             SELECT 1
-            FROM nhp.reference.ods_trusts
+            FROM {table_names.reference_ods_trusts}
             WHERE a.provider = org_to
             AND org_type LIKE 'ACUTE%'
         )
diff --git a/src/nhp/data/default/opa.py b/src/nhp/data/default/opa.py
index da06dba3..0c4e8ce7 100644
--- a/src/nhp/data/default/opa.py
+++ b/src/nhp/data/default/opa.py
@@ -1,17 +1,19 @@
 from pyspark.sql import SparkSession
 
+from nhp.data.table_names import table_names
+
 
 def create(spark: SparkSession) -> None:
     spark.sql(
-        """
-        CREATE OR REPLACE VIEW nhp.default.opa
+        f"""
+        CREATE OR REPLACE VIEW {table_names.default_opa}
         AS
         SELECT
             *
-        FROM nhp.aggregated_data.opa a
+        FROM {table_names.aggregated_data_opa} a
         WHERE EXISTS (
             SELECT 1
-            FROM nhp.reference.ods_trusts
+            FROM {table_names.reference_ods_trusts}
             WHERE a.provider = org_to
             AND org_type LIKE 'ACUTE%'
         )
diff --git a/src/nhp/data/extract_data.py b/src/nhp/data/extract_data.py
new file mode 100644
index 00000000..84291974
--- /dev/null
+++ b/src/nhp/data/extract_data.py
@@ -0,0 +1,69 @@
+"""Move data to NHP storage for use in the model.
+
+You will need to manually generate a SAS token for both the data and the inputs
+data containers, and store these in databricks secrets.
+
+The SAS tokens should have create and write permissions, and should have as
+short an expiry time as possible (e.g. 1 day).
+
+Once you have generated the tokens, run
+
+    databricks secrets put-secret nhp MLCSU_DATA_SAS
+
+and
+
+    databricks secrets put-secret nhp MLCSU_INPUTS_DATA_SAS
+"""
+
+import os
+
+from azure.storage.blob import BlobServiceClient
+from pyspark.dbutils import DBUtils
+from tqdm.auto import tqdm
+
+from nhp.data.get_spark import get_spark
+
+
+def _move_files(path: str, container, extract_version: str):
+    all_files = list()
+    for root, dirs, files in os.walk(path):
+        for file in files:
+            src = os.path.join(root, file)
+            dst = os.path.relpath(src, path)
+
+            all_files.append((src, dst))
+    for src, dst in tqdm(all_files):
+        with open(src, "rb") as data:
+            container.upload_blob(
+                name=f"{extract_version}/{dst}", data=data, overwrite=True
+            )
+
+
+def move_inputs_data(dbutils: DBUtils, path: str):
+    sas = dbutils.secrets.get("nhp", "MLCSU_INPUTS_DATA_SAS")
+    bsc = BlobServiceClient(sas)
+    container = bsc.get_container_client("inputs-data")
+    _move_files(f"{path}/inputs_data/dev", container, "dev")
+
+
+def move_data(dbutils: DBUtils, path: str):
+    sas = dbutils.secrets.get("nhp", "MLCSU_DATA_SAS")
+    bsc = BlobServiceClient(sas)
+    container = bsc.get_container_client("data")
+    _move_files(f"{path}/model_data/dev", container, "dev")
+
+
+def main():
+    spark = get_spark()
+    dbutils = DBUtils(spark)
+
+    path = "/Volumes/udal_lake_mart/newhospitalprogramme/files"
+
+    try:
+        move_inputs_data(dbutils, path)
+        move_data(dbutils, path)
+    except Exception as e:
+        print(
+            "Error moving files; have you followed the instructions for generating the SAS token?"
+        )
+        raise e
diff --git a/src/nhp/data/inputs_data/age_sex.py b/src/nhp/data/inputs_data/age_sex.py
index ebaf4ea1..2c3db36a 100644
--- a/src/nhp/data/inputs_data/age_sex.py
+++ b/src/nhp/data/inputs_data/age_sex.py
@@ -12,6 +12,7 @@
 from nhp.data.inputs_data.helpers import inputs_age_group
 from nhp.data.inputs_data.ip import get_ip_age_sex_data
 from nhp.data.inputs_data.op import get_op_age_sex_data
+from nhp.data.inputs_data.save_parquet import save_parquet
 from nhp.data.table_names import table_names
 
 
@@ -31,18 +32,22 @@ def get_age_sex(spark: SparkSession, geography_column: str) -> DataFrame:
         get_op_age_sex_data,
     ]
 
-    cols = ["fyear", geography_column, "strategy", "sex", "age", "n"]
-
-    df = reduce(
-        DataFrame.unionByName, [f(spark, geography_column).select(*cols) for f in fns]
-    )
-
     age_groups = inputs_age_group(spark)
 
-    return (
-        df.join(age_groups, "age")
-        .groupBy(*cols[:-2], "age_group")
-        .agg(F.sum("n").alias("n"))
+    cols = ["fyear", geography_column, "strategy", "sex", "age", "n"]
+
+    def get_data(fn):
+        df = (
+            fn(spark, geography_column)
+            .join(age_groups, "age")
+            .groupBy(*cols[:-2], "age_group")
+            .agg(F.sum("n").alias("n"))
+            .cache()
+        )
+        df.count()  # materialise
+        return df
+
+    return reduce(DataFrame.unionByName, map(get_data, fns)).orderBy(
+        *cols[:-2], "age_group"
     )
 
 
@@ -62,7 +67,7 @@ def save_age_sex(path: str, spark: SparkSession, geography_column: str) -> None
         df = filter_acute_providers(spark, df, "provider")
     df = df.filter(F.col(geography_column) != "unknown")
 
-    df.toPandas().to_parquet(f"{path}/age_sex.parquet")
+    save_parquet(df, f"{path}/age_sex")
 
 
 def main():
diff --git a/src/nhp/data/inputs_data/baseline.py b/src/nhp/data/inputs_data/baseline.py
index e724d645..23bc0c83 100644
--- a/src/nhp/data/inputs_data/baseline.py
+++ b/src/nhp/data/inputs_data/baseline.py
@@ -11,6 +11,7 @@
 from nhp.data.inputs_data.ae.baseline import get_ae_baseline
 from nhp.data.inputs_data.ip.baseline import get_ip_baseline
 from nhp.data.inputs_data.op.baseline import get_op_baseline
+from nhp.data.inputs_data.save_parquet import save_parquet
 from nhp.data.table_names import table_names
 
 
@@ -26,7 +27,9 @@ def get_baseline(spark: SparkSession, geography_column: str) -> DataFrame:
     """
     fns = [get_ae_baseline, get_ip_baseline, get_op_baseline]
 
-    return reduce(DataFrame.unionByName, [f(spark, geography_column) for f in fns])
+    return reduce(
+        DataFrame.unionByName, [f(spark, geography_column) for f in fns]
+    ).orderBy("fyear", geography_column, "activity_type", "group", "tretspef")
 
 
 def save_baseline(path: str, spark: SparkSession, geography_column: str) -> None:
@@ -43,7 +46,7 @@ def save_baseline(path: str, spark: SparkSession, geography_column: str) -> Non
     df = filter_acute_providers(spark, df, "provider")
     df = df.filter(F.col(geography_column) != "unknown")
 
-    df.toPandas().to_parquet(f"{path}/baseline.parquet")
+    save_parquet(df, f"{path}/baseline")
 
 
 def main():
diff --git a/src/nhp/data/inputs_data/diagnoses.py b/src/nhp/data/inputs_data/diagnoses.py
index 6631ca04..c8b3a007 100644
--- a/src/nhp/data/inputs_data/diagnoses.py
+++ b/src/nhp/data/inputs_data/diagnoses.py
@@ -11,6 +11,7 @@
 from nhp.data.inputs_data.ae.diagnoses import get_ae_diagnoses
 from nhp.data.inputs_data.ip.diagnoses import get_ip_diagnoses
 from nhp.data.inputs_data.op.diagnoses import get_op_diagnoses
+from nhp.data.inputs_data.save_parquet import save_parquet
 from nhp.data.table_names import table_names
 
 
@@ -26,11 +27,13 @@ def get_diagnoses(spark: SparkSession, geography_column: str) -> DataFrame:
     """
     fns = [get_ae_diagnoses, get_ip_diagnoses, get_op_diagnoses]
 
-    return reduce(DataFrame.unionByName, [f(spark, geography_column) for f in fns])
+    return reduce(
+        DataFrame.unionByName, [f(spark, geography_column) for f in fns]
+    ).orderBy("fyear", geography_column, "strategy", "rn")
 
 
 def save_diagnoses(path: str, spark: SparkSession, geography_column: str) -> None:
-    """Save baseline data.
+    """Save diagnoses data.
 
     :param path: The path to save the data to
     :type path: str
@@ -45,7 +48,7 @@ def save_diagnoses(path: str, spark: SparkSession, geography_column: str) -> No
     df = filter_acute_providers(spark, df, "provider")
     df = df.filter(F.col(geography_column) != "unknown")
 
-    df.toPandas().to_parquet(f"{path}/diagnoses.parquet")
+    save_parquet(df, f"{path}/diagnoses")
 
 
 def main():
diff --git a/src/nhp/data/inputs_data/expat_repat.py b/src/nhp/data/inputs_data/expat_repat.py
index 49866e7a..b53724c4 100644
--- a/src/nhp/data/inputs_data/expat_repat.py
+++ b/src/nhp/data/inputs_data/expat_repat.py
@@ -24,6 +24,7 @@
     get_op_repat_local_data,
     get_op_repat_nonlocal_data,
 )
+from nhp.data.inputs_data.save_parquet import save_parquet
 from nhp.data.table_names import table_names
 
 
@@ -37,7 +38,9 @@ def get_expat_data(spark: SparkSession) -> DataFrame:
     """
     fns = [get_ae_expat_data, get_ip_expat_data, get_op_expat_data]
 
-    return reduce(DataFrame.unionByName, [f(spark) for f in fns])
+    return reduce(DataFrame.unionByName, [f(spark) for f in fns]).orderBy(
+        "fyear", "provider", "activity_type", "group", "tretspef"
+    )
 
 
 def get_repat_local_data(spark: SparkSession) -> DataFrame:
@@ -50,7 +53,9 @@ def get_repat_local_data(spark: SparkSession) -> DataFrame:
     """
     fns = [get_ae_repat_local_data, get_ip_repat_local_data, get_op_repat_local_data]
 
-    return reduce(DataFrame.unionByName, [f(spark) for f in fns])
+    return reduce(DataFrame.unionByName, [f(spark) for f in fns]).orderBy(
+        "fyear", "icb", "provider", "activity_type", "group", "tretspef"
+    )
 
 
 def get_repat_nonlocal_data(spark: SparkSession) -> DataFrame:
@@ -67,7 +72,9 @@ def get_repat_nonlocal_data(spark: SparkSession) -> DataFrame:
         get_op_repat_nonlocal_data,
     ]
 
-    return reduce(DataFrame.unionByName, [f(spark) for f in fns])
+    return reduce(DataFrame.unionByName, [f(spark) for f in fns]).orderBy(
+        "fyear", "icb", "is_main_icb", "provider", "activity_type", "group", "tretspef"
+    )
 
 
 def save_expat_repat_data(path: str, spark: SparkSession) -> None:
@@ -88,7 +95,8 @@ def save_expat_repat_data(path: str, spark: SparkSession) -> None:
     for name, fn in fns.items():
         df = filter_acute_providers(spark, fn(spark), "provider")
         df = df.filter(F.col("provider") != "unknown")
-        df.toPandas().to_parquet(f"{path}/{name}.parquet")
+
+        save_parquet(df, f"{path}/{name}")
 
 
 def main():
diff --git a/src/nhp/data/inputs_data/inequalities.py b/src/nhp/data/inputs_data/inequalities.py
index 84cdcb40..06a4fa2d 100644
--- a/src/nhp/data/inputs_data/inequalities.py
+++ b/src/nhp/data/inputs_data/inequalities.py
@@ -17,6 +17,7 @@
 )
 
 from nhp.data.get_spark import get_spark
+from nhp.data.inputs_data.save_parquet import save_parquet
 from nhp.data.table_names import table_names
 
 
@@ -36,6 +37,7 @@ def load_inequalities_data(spark: SparkSession) -> DataFrame:
         .drop("imd19")
         .groupby("icb", "provider", "imd_quintile")
         .agg(F.sum("pop").alias("pop"))
+        .filter(F.col("pop") > 0)
         .withColumn(
             "population_share",
             F.col("pop") / F.sum("pop").over(Window.partitionBy(["icb", "provider"])),
@@ -212,7 +214,7 @@ def save_inequalities(path: str, spark: SparkSession) -> None:
     :param spark: The spark session to use
     :type spark: SparkSession
     """
-    mlflow.autolog(  # ty: ignore[possibly-missing-attribute]
+    mlflow.autolog(
         log_input_examples=False,
         log_model_signatures=False,
         log_models=False,
@@ -232,13 +234,13 @@ def save_inequalities(path: str, spark: SparkSession) -> None:
         .mode("overwrite")
         .saveAsTable(table_names.default_inequalities)
     )
-    inequalities.toPandas().to_parquet(f"{path}/inequalities.parquet")
+    save_parquet(inequalities, f"{path}/inequalities")
 
 
 def main():
     geography_column = sys.argv[1]
     if geography_column != "provider":
-        logging.info("skipping expat_repat data generation for non-provider geography")
+        logging.info("skipping inequalities data generation for non-provider geography")
         return
 
     path = f"{table_names.inputs_save_path}/{geography_column}"
diff --git a/src/nhp/data/inputs_data/ip/rates.py b/src/nhp/data/inputs_data/ip/rates.py
index bdf63a2f..a4ba8007 100644
--- a/src/nhp/data/inputs_data/ip/rates.py
+++ b/src/nhp/data/inputs_data/ip/rates.py
@@ -64,7 +64,9 @@ def get_ip_activity_avoidance_rates(
     )
 
     df = complete_age_sex_rows(spark, df, geography_column)
-    return df.join(pop, ["fyear", "age", "sex", geography_column], "left")
+    return df.join(pop, ["fyear", "age", "sex", geography_column], "left").filter(
+        F.col("d") > 0
+    )
 
 
 @directly_standardise
diff --git a/src/nhp/data/inputs_data/op/expat_repat.py b/src/nhp/data/inputs_data/op/expat_repat.py
index 773c91ae..fafdbc39 100644
--- a/src/nhp/data/inputs_data/op/expat_repat.py
+++ b/src/nhp/data/inputs_data/op/expat_repat.py
@@ -33,6 +33,7 @@ def _get_icb_df(spark: SparkSession) -> DataFrame:
         .withColumn("group", F.lit(""))
         .groupBy("fyear", "icb", "is_main_icb", "provider", "group", "tretspef")
         .agg(F.sum("attendance").alias("count"))
+        .filter(F.col("count") > 0)
         .persist()
     )
 
diff --git a/src/nhp/data/inputs_data/op/rates.py b/src/nhp/data/inputs_data/op/rates.py
index fddb028a..04f97511 100644
--- a/src/nhp/data/inputs_data/op/rates.py
+++ b/src/nhp/data/inputs_data/op/rates.py
@@ -18,10 +18,14 @@ def get_op_rates(spark: SparkSession, geography_column: str) -> DataFrame:
     :return: The outpatients activity avoidances rates
     :rtype: DataFrame
     """
-    return get_op_age_sex_data(spark, geography_column).withColumn(
-        "d",
-        F.when(
-            F.col("strategy").startswith("followup_reduction_"),
-            F.col("d") - F.col("n"),
-        ).otherwise(F.col("d")),
+    return (
+        get_op_age_sex_data(spark, geography_column)
+        .withColumn(
+            "d",
+            F.when(
+                F.col("strategy").startswith("followup_reduction_"),
+                F.col("d") - F.col("n"),
+            ).otherwise(F.col("d")),
+        )
+        .filter(F.col("d") > 0)
     )
diff --git a/src/nhp/data/inputs_data/procedures.py b/src/nhp/data/inputs_data/procedures.py
index fa393708..93f22813 100644
--- a/src/nhp/data/inputs_data/procedures.py
+++ b/src/nhp/data/inputs_data/procedures.py
@@ -11,6 +11,7 @@
 from nhp.data.inputs_data.ae.procedures import get_ae_procedures
 from nhp.data.inputs_data.ip.procedures import get_ip_procedures
 from nhp.data.inputs_data.op.procedures import get_op_procedures
+from nhp.data.inputs_data.save_parquet import save_parquet
 from nhp.data.table_names import table_names
 
 
@@ -26,7 +27,9 @@ def get_procedures(spark: SparkSession, geography_column: str) -> DataFrame:
     """
     fns = [get_ae_procedures, get_ip_procedures, get_op_procedures]
 
-    return reduce(DataFrame.unionByName, [f(spark, geography_column) for f in fns])
+    return reduce(
+        DataFrame.unionByName, [f(spark, geography_column) for f in fns]
+    ).orderBy("fyear", geography_column, "strategy", "rn")
 
 
 def save_procedures(path: str, spark: SparkSession, geography_column: str) -> None:
@@ -45,7 +48,7 @@ def save_procedures(path: str, spark: SparkSession, geography_column: str) -> No
     df = filter_acute_providers(spark, df, "provider")
     df = df.filter(F.col(geography_column) != "unknown")
 
-    df.toPandas().to_parquet(f"{path}/procedures.parquet")
+    save_parquet(df, f"{path}/procedures")
 
 
 def main():
diff --git a/src/nhp/data/inputs_data/rates.py b/src/nhp/data/inputs_data/rates.py
index b70013d4..d6456c59 100644
--- a/src/nhp/data/inputs_data/rates.py
+++ b/src/nhp/data/inputs_data/rates.py
@@ -17,6 +17,7 @@
     get_ip_preop_rates,
 )
 from nhp.data.inputs_data.op.rates import get_op_rates
+from nhp.data.inputs_data.save_parquet import save_parquet
 from nhp.data.table_names import table_names
 
 
@@ -39,7 +40,9 @@ def get_rates(spark: SparkSession, geography_column: str) -> DataFrame:
         get_op_rates,
     ]
 
-    return reduce(DataFrame.unionByName, [f(spark, geography_column) for f in fns])
+    return reduce(
+        DataFrame.unionByName, [f(spark, geography_column) for f in fns]
+    ).orderBy("fyear", geography_column, "strategy")
 
 
 def save_rates(path: str, spark: SparkSession, geography_column: str) -> None:
@@ -59,7 +62,7 @@ def save_rates(path: str, spark: SparkSession, geography_column: str) -> None:
     df = filter_acute_providers(spark, df, "provider").unionByName(df_national)
     df = df.filter(F.col(geography_column) != "unknown")
 
-    df.toPandas().to_parquet(f"{path}/rates.parquet")
+    save_parquet(df, f"{path}/rates")
 
 
 def main():
diff --git a/src/nhp/data/inputs_data/save_parquet.py b/src/nhp/data/inputs_data/save_parquet.py
new file mode 100644
index 00000000..3fa08b68
--- /dev/null
+++ b/src/nhp/data/inputs_data/save_parquet.py
@@ -0,0 +1,20 @@
+import os
+import shutil
+
+
+def save_parquet(df, path):
+    # save the dataframe
+    df.coalesce(1).write.mode("overwrite").parquet(path)
+    # find the created files
+    files = os.listdir(path)
+    # find just the parquet file
+    parquets = list(filter(lambda f: f.endswith(".parquet"), files))
+    assert len(parquets) == 1, "expected exactly one parquet file"
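+    # coalesce(1) above wrote a single partition, so parquets[0] is the full dataset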
+    parquet = parquets[0]
+    # move the parquet file
+    if os.path.exists(f"{path}.parquet"):
+        os.remove(f"{path}.parquet")
+    os.rename(f"{path}/{parquet}", f"{path}.parquet")
+    # remove the other files
+    shutil.rmtree(path)
diff --git a/src/nhp/data/model_data/generate_synthetic_data.py b/src/nhp/data/model_data/generate_synthetic_data.py
index 5921e579..9639a2d0 100644
--- a/src/nhp/data/model_data/generate_synthetic_data.py
+++ b/src/nhp/data/model_data/generate_synthetic_data.py
@@ -59,6 +59,11 @@ def read_file(self, file: str, path: str) -> DataFrame:
     def save_synth_file(self, file: str, df: pd.DataFrame) -> None:
         p = f"{self._synth_path}/{file}/fyear={self._fyear}/dataset=synthetic"
         os.makedirs(p, exist_ok=True)
+        df.attrs = {
+            k: v
+            for k, v in df.attrs.items()
+            if k not in ["metrics", "observed_metrics"]
+        }
         df.to_parquet(f"{p}/0.parquet")
 
     def generate(self) -> None:
diff --git a/src/nhp/data/nhp_datasets/local_authorities.py b/src/nhp/data/nhp_datasets/local_authorities.py
index 46253dac..47df5714 100644
--- a/src/nhp/data/nhp_datasets/local_authorities.py
+++ b/src/nhp/data/nhp_datasets/local_authorities.py
@@ -1,7 +1,4 @@
-from itertools import chain
-
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql import functions as F
 from pyspark.sql.types import *  # noqa: F403
 
 from nhp.data.table_names import table_names
@@ -24,37 +21,3 @@ def lsoa11_to_lad23(spark: SparkSession, df: DataFrame, col_name: str) -> DataFr
     ).withColumnRenamed("lsoa11cd", col_name)
 
     return df.join(lad23_lookup, col_name, "left")
-
-
-def local_authority_successors(
-    spark: SparkSession, df: DataFrame, col_name: str
-) -> DataFrame:
-    """Local Authority Successors
-
-    If a local authority has been succeeded, replace it with the successor
-
-    :param spark: the spark context
-    :type spark: SparkSession
-    :param df: the dataframe to add the main icb column to
-    :type df: DataFrame
-    :param col_name: the column to update
-    :type col_name: str
-    :return: the modified dataframe
-    :rtype: DataFrame
-    """
-
-    lad11_to_lad23_lookup = spark.read.csv(
-        table_names.reference_lad11_to_lad23_lookup,
-        header=True,
-    )
-
-    lookup = {
-        r["lad11cd"]: r["lad21cd"]
-        for r in lad11_to_lad23_lookup.select("lad11cd", "lad21cd").distinct().collect()
-    }
-
-    lookup_map = F.create_map([F.lit(x) for x in chain(*lookup.items())])
-
-    return df.withColumn(
-        col_name, F.coalesce(lookup_map[F.col(col_name)], F.col(col_name))
-    )
diff --git a/src/nhp/data/population_projections/get_ons_files_2022.py b/src/nhp/data/population_projections/get_ons_files_2022.py
index c9bb4306..96237a1a 100644
--- a/src/nhp/data/population_projections/get_ons_files_2022.py
+++ b/src/nhp/data/population_projections/get_ons_files_2022.py
@@ -53,9 +53,11 @@ def get_snpp_uris(path: str) -> list[str]:
     matches = soup.find_all(
         "a",
         attrs={
-            "aria-label": lambda x: x
-            and "2022-based" in x
-            and re.match("^Download (Population|Birth) projections", x)
+            "aria-label": lambda x: (
+                x
+                and "2022-based" in x
+                and re.match("^Download (Population|Birth) projections", x)
+            )
         },
     )
 
@@ -174,6 +176,7 @@ def get_npp_uri(path: str) -> str:
         attrs={"aria-label": lambda x: x and re.match(pattern, x)},
     )
     assert match is not None, "could not find NPP zip file link"
+    assert isinstance(match["href"], str), "unexpected href type for NPP zip file link"
 
     return urljoin(ONS_URL, match["href"])
 
diff --git a/src/nhp/data/raw_data/ecds.py b/src/nhp/data/raw_data/ecds.py
index 72d4a626..a46cc52a 100644
--- a/src/nhp/data/raw_data/ecds.py
+++ b/src/nhp/data/raw_data/ecds.py
@@ -19,10 +19,16 @@
 
 def get_ecds_data(spark: SparkSession) -> DataFrame:
     """Get ECDS data"""
-    df = spark.read.table(table_names.hes_ecds)
+    df = (
+        spark.read.table(table_names.hes_ecds)
+        .withColumnRenamed("pseudo_nhs_number_pet", "token_person_id")
+        .withColumn("token_person_id", F.col("token_person_id").cast("string"))
+        .withColumn("ec_ident", F.col("ec_ident").cast("string"))
+    )
 
     # filter out mental health providers last: some MH providers run urgent care/MIUs
     # which we want to keep in for the frequent attenders
     df = add_provider(spark, df, "der_provider_code", "der_provider_site_code")
+    df = df.select([F.col(c).alias(c.lower()) for c in df.columns])
 
     # Add IMD fields
diff --git a/src/nhp/data/raw_data/inpatients.py b/src/nhp/data/raw_data/inpatients.py
index 66d9bace..f68d9d6e 100644
--- a/src/nhp/data/raw_data/inpatients.py
+++ b/src/nhp/data/raw_data/inpatients.py
@@ -16,11 +16,41 @@
 from nhp.data.table_names import table_names
 
 
+def add_maternity_episode_type(spark: SparkSession, df: DataFrame) -> DataFrame:
+    epitype_2_5 = F.col("epitype").isin(["2", "5"])
+    epitype_3_6 = F.col("epitype").isin(["3", "6"])
+    delplac_1_5_6 = F.col("delplac").isin(["1", "5", "6"])
+    classpat_3_4 = F.col("classpat").isin(["3", "4"])
+    epistat_3 = F.col("epistat") == "3"
+    delmeth_not_null = F.col("delmeth").isNotNull()
+
+    apc_births_table = f"{table_names.hes_apc}_births"
+    apc_births = (
+        spark.read.table(apc_births_table)
+        .groupBy("epikey", "fyear", "procode3")
+        .agg(F.min("delmeth").alias("delmeth"), F.min("delplac").alias("delplac"))
+    )
+
+    return (
+        df.join(apc_births, ["epikey", "fyear", "procode3"], "left")
+        .withColumn(
+            "maternity_episode_type",
+            F.when(epitype_2_5 & ~delplac_1_5_6 & ~classpat_3_4 & epistat_3, 1)
+            .when(epitype_3_6 & ~delplac_1_5_6 & ~classpat_3_4 & epistat_3, 2)
+            .when(epitype_2_5 & delplac_1_5_6 & ~classpat_3_4 & epistat_3, 3)
+            .when(epitype_3_6 & delplac_1_5_6 & ~classpat_3_4 & epistat_3, 4)
+            .when(delmeth_not_null, 9)
+            .otherwise(99),
+        )
+        .select(*df.columns, "maternity_episode_type")
+    )
+
+
 def get_inpatients_data(spark: SparkSession) -> DataFrame:
     """Get Inpatients Data"""
     # Spell has maternity delivery episode
     mat_delivery_spells = (
-        spark.read.table(table_names.hes_apc)
+        add_maternity_episode_type(spark, spark.read.table(table_names.hes_apc))
         .filter(F.col("fce") == 1)
         .filter(F.col("maternity_episode_type") == 1)
         .select("susspellid")
diff --git a/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/alcohol.py b/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/alcohol.py
index 3dc0aea6..5f89f80a 100644
--- a/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/alcohol.py
+++ b/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/alcohol.py
@@ -76,10 +76,6 @@ def _alcohol_wholly_attributable():
 
 def _alcohol_partially_attributable(condition_group):
     spark = get_spark()
-    assert hasattr(spark, "sparkContext"), (
-        "sparkContext is not available on the SparkSession object."
- ) - sc = spark.sparkContext icd = spark.read.table(table_names.reference_icd10_codes) @@ -108,7 +104,7 @@ def _alcohol_partially_attributable(condition_group): ] aaf = ( - spark.read.json(sc.parallelize(aaf_list)) # ty: ignore[invalid-argument-type] + spark.createDataFrame(aaf_list) .join(icd.select("icd10"), F.expr("icd10 rlike regex")) .persist() ) diff --git a/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/evidence_based_interventions_msk.py b/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/evidence_based_interventions_msk.py index 883af24c..09a4bf09 100644 --- a/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/evidence_based_interventions_msk.py +++ b/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/evidence_based_interventions_msk.py @@ -63,8 +63,8 @@ def _arthroscopic_meniscal_tear(): # either there is a diagnosis of M323, or they have both M233 and M238 diagnoses diags = [ lambda x: x.admission_has(any_diagnosis, "M232"), - lambda x: ( - x.admission_has(any_diagnosis, "M233").admission_has(any_diagnosis, "M238") + lambda x: x.admission_has(any_diagnosis, "M233").admission_has( + any_diagnosis, "M238" ), ] diff --git a/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/obesity_related_admissions.py b/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/obesity_related_admissions.py index 66f4fdcc..e3b44ef2 100644 --- a/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/obesity_related_admissions.py +++ b/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/obesity_related_admissions.py @@ -13,7 +13,7 @@ randomly selects 36% of spells meeting these criteria. The OAFs are also sourced from the above referenced document.""" -import pyspark.sql.types as T +import pandas as pd from pyspark.sql import functions as F from nhp.data.get_spark import get_spark @@ -27,19 +27,7 @@ def _obesity_related_admissions(): spark = get_spark() filename = get_reference_file_path("obesity_attributable_fractions.csv") - oaf = ( - spark.read.option("header", "true") - .option("delimiter", ",") - .schema( - T.StructType( - [ - T.StructField("diagnosis", T.StringType(), False), - T.StructField("fraction", T.DoubleType(), False), - ] - ) - ) - .csv(f"file:///{filename}") - ) + oaf = spark.createDataFrame(pd.read_csv(filename)) return ( nhp_apc.join(diagnoses, ["epikey", "fyear"]) diff --git a/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/smoking_related_admissions.py b/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/smoking_related_admissions.py index 36d73345..cce61311 100644 --- a/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/smoking_related_admissions.py +++ b/src/nhp/data/raw_data/mitigators/ip/activity_avoidance/smoking_related_admissions.py @@ -21,7 +21,7 @@ [1]: https://www.rcplondon.ac.uk/projects/outputs/hiding-plain-sight-treating-tobacco-dependency-nhs """ -import pyspark.sql.types as T +import pandas as pd from pyspark.sql import functions as F from nhp.data.get_spark import get_spark @@ -37,20 +37,7 @@ def _smoking(): filename = get_reference_file_path("smoking_attributable_fractions.csv") - saf = ( - spark.read.option("header", "true") - .option("delimiter", ",") - .schema( - T.StructType( - [ - T.StructField("diagnoses", T.StringType(), False), - T.StructField("sex", T.IntegerType(), False), - T.StructField("value", T.DoubleType(), False), - ] - ) - ) - .csv(f"file:///{filename}") - ) + saf = spark.createDataFrame(pd.read_csv(filename)) icd10_codes = spark.read.table(table_names.reference_icd10_codes) diff --git 
a/src/nhp/data/raw_data/mitigators/ip/efficiency/excess_beddays.py b/src/nhp/data/raw_data/mitigators/ip/efficiency/excess_beddays.py index 309cfe7d..3adbabdd 100644 --- a/src/nhp/data/raw_data/mitigators/ip/efficiency/excess_beddays.py +++ b/src/nhp/data/raw_data/mitigators/ip/efficiency/excess_beddays.py @@ -27,7 +27,7 @@ """ # pylint: enable=line-too-long -import pyspark.sql.types as T +import pandas as pd from pyspark.sql import functions as F from nhp.data.get_spark import get_spark @@ -41,16 +41,13 @@ def _excess_beddays(group): filename = get_reference_file_path("hrg_trimpoints.csv") ebd = ( - spark.read.schema( - T.StructType( - [ - T.StructField("sushrg", T.StringType(), False), - T.StructField("elective", T.IntegerType(), True), - T.StructField("emergency", T.IntegerType(), True), - ] + spark.createDataFrame( + pd.read_csv( + filename, + na_values="-", + dtype={"sushrg": "str", "elective": "Int64", "emergency": "Int64"}, ) ) - .csv(f"file:///{filename}", sep=",", header=True, nullValue="-") .select("sushrg", F.col(group).alias("trimpoint")) .dropna() ) diff --git a/src/nhp/data/reference/day_procedures.py b/src/nhp/data/reference/day_procedures.py index 05c76c04..14ec018c 100644 --- a/src/nhp/data/reference/day_procedures.py +++ b/src/nhp/data/reference/day_procedures.py @@ -109,17 +109,21 @@ def get_day_procedure_code_list( ) .assign( prob_usually=lambda x: x.apply( - lambda y: binomtest( - y.value, y.total, p=P_USUALLY, alternative="greater" - ).pvalue, + lambda y: ( + binomtest( + y.value, y.total, p=P_USUALLY, alternative="greater" + ).pvalue + ), axis=1, ) ) .assign( prob_occasionally=lambda x: x.apply( - lambda y: binomtest( - y.value, y.total, p=P_OCCASIONALLY, alternative="greater" - ).pvalue, + lambda y: ( + binomtest( + y.value, y.total, p=P_OCCASIONALLY, alternative="greater" + ).pvalue + ), axis=1, ) ) diff --git a/src/nhp/data/reference/icb_catchments.py b/src/nhp/data/reference/icb_catchments.py index 4c83b2a5..3f64aad5 100644 --- a/src/nhp/data/reference/icb_catchments.py +++ b/src/nhp/data/reference/icb_catchments.py @@ -56,7 +56,7 @@ def get_icb_catchments(spark: SparkSession) -> DataFrame: if not features: break all_features.extend(features) - params["resultOffset"] += len(features) # ty: ignore[unsupported-operator] + params["resultOffset"] += len(features) lad_icb_lsoa_counts = defaultdict(lambda: defaultdict(lambda: 0)) icb_la_pcnts = [] diff --git a/src/nhp/data/reference/ods_trusts.py b/src/nhp/data/reference/ods_trusts.py index 1e7e336b..b4fcd946 100644 --- a/src/nhp/data/reference/ods_trusts.py +++ b/src/nhp/data/reference/ods_trusts.py @@ -77,7 +77,7 @@ def _get_text(elem: ET.Element, path: str) -> str | None: return e.text -def process_successor(x: ET.Element, org_code: str) -> dict: +def process_successor(x: ET.Element, org_code: str) -> dict[str, str | None]: """Process successor records :param x: xml element @@ -98,7 +98,9 @@ def process_successor(x: ET.Element, org_code: str) -> dict: } -def process_organisation(org: ET.Element) -> dict: +def process_organisation( + org: ET.Element, +) -> dict[str, str | list[dict[str, str | None]] | None]: """Process an organisation record :param org: xml element @@ -121,7 +123,9 @@ def process_organisation(org: ET.Element) -> dict: for i in org.findall("Date") if _get_attrib(i, "Type", "value") == "Operational" ][0] - org_dict["start_date"] = operational_date.find("Start").get("value") # ty: ignore[possibly-missing-attribute] + start_date = operational_date.find("Start") + assert start_date is not None, 
"Operational date must have a start date" + org_dict["start_date"] = start_date.get("value") end_date = operational_date.find("End") if end_date is not None: org_dict["end_date"] = end_date.get("value") @@ -130,12 +134,13 @@ def process_organisation(org: ET.Element) -> dict: if postcode is not None: org_dict["postcode"] = postcode.text - org_dict["successors"] = [ - process_successor(i, org_code) - for i in org.findall("Succs/Succ[Type='Predecessor']") - ] - - return org_dict + return { + **org_dict, + "successors": [ + process_successor(i, org_code) + for i in org.findall("Succs/Succ[Type='Predecessor']") + ], + } def get_successors_df(processed_orgs: list, ods_df: pd.DataFrame) -> pd.DataFrame: @@ -189,7 +194,7 @@ def get_successors_df(processed_orgs: list, ods_df: pd.DataFrame) -> pd.DataFram } ) if k: - q.append((k, end_date)) + q.append((k, end_date)) # ty: ignore[invalid-argument-type] successors_df = pd.DataFrame(transitive_closure).drop_duplicates() diff --git a/src/nhp/data/reference/population_by_imd_decile.py b/src/nhp/data/reference/population_by_imd_decile.py index 6ea75d37..40afd888 100644 --- a/src/nhp/data/reference/population_by_imd_decile.py +++ b/src/nhp/data/reference/population_by_imd_decile.py @@ -48,6 +48,8 @@ def create_population_by_imd_decile( .withColumn("age", F.col("age").cast("int")) .withColumn("imd19", F.col("imd19").cast("int")) .withColumnRenamed("lsoa11cd", "lsoa11") + .filter(F.col("pop") != "NA") + .withColumn("pop", F.col("pop").cast("int")) .join(age_groups, "age") .groupBy("lsoa11", "imd19", "sex", "age_group") .agg(F.sum("pop").alias("pop")) diff --git a/src/nhp/data/reference/population_by_lsoa21.py b/src/nhp/data/reference/population_by_lsoa21.py index f9ef645e..a130deb7 100644 --- a/src/nhp/data/reference/population_by_lsoa21.py +++ b/src/nhp/data/reference/population_by_lsoa21.py @@ -13,7 +13,7 @@ from nhp.data.table_names import table_names -def create_pop_by_lsoa21(): +def create_pop_by_lsoa21(spark: SparkSession, table: str) -> None: BASE_URL = "https://www.ons.gov.uk" url = "/".join( @@ -31,10 +31,17 @@ def create_pop_by_lsoa21(): soup = BeautifulSoup(resp.text, "html.parser") - def get_pop_by_lsoa_file(file_title: str, years: list[int]) -> pd.DataFrame: - def year_to_fyear(year): - return year * 100 + (year + 1) % 100 - + def year_to_fyear(year): + return year * 100 + (year + 1) % 100 + + spark.sql(f"DROP TABLE IF EXISTS {table}") + for file_title, years in [ + ("Mid-2011 to mid-2014", [2011, 2012, 2013, 2014]), + ("Mid-2015 to mid-2018", [2015, 2016, 2017, 2018]), + ("Mid-2019 to mid-2022", [2019, 2020, 2021]), + ("Mid-2022 revised (Nov 2025) to mid-2024", [2022, 2023, 2024]), + ]: + print(f"Processing file: {file_title}") file_link = soup.find( "a", attrs={"aria-label": lambda x: x and file_title in x} ) @@ -49,48 +56,33 @@ def year_to_fyear(year): file = io.BytesIO(response.content) - return pd.concat( - [ - ( - pd.read_excel(file, sheet_name=f"Mid-{year} LSOA 2021", skiprows=3) - .rename(columns={"LSOA 2021 Code": "lsoa21cd"}) - .melt( - id_vars="lsoa21cd", - value_vars=[f"{s}{i}" for s in "FM" for i in range(0, 91)], - var_name="sex_age", - value_name="population", - ) - .assign( - sex=lambda x: np.where(x["sex_age"].str[0] == "M", 1, 2), - age=lambda x: x["sex_age"].str[1:].astype("int"), - fyear=year_to_fyear(year), - ) - .drop(columns=["sex_age"]) + for year in years: + print(f"> {year}") + df = ( + pd.read_excel(file, sheet_name=f"Mid-{year} LSOA 2021", skiprows=3) + .rename(columns={"LSOA 2021 Code": "lsoa21cd"}) + .melt( + 
diff --git a/src/nhp/data/reference/population_by_lsoa21.py b/src/nhp/data/reference/population_by_lsoa21.py
index f9ef645e..a130deb7 100644
--- a/src/nhp/data/reference/population_by_lsoa21.py
+++ b/src/nhp/data/reference/population_by_lsoa21.py
@@ -13,7 +13,7 @@
 from nhp.data.table_names import table_names
 
 
-def create_pop_by_lsoa21():
+def create_pop_by_lsoa21(spark: SparkSession, table: str) -> None:
     BASE_URL = "https://www.ons.gov.uk"
 
     url = "/".join(
@@ -31,10 +31,17 @@
 
     soup = BeautifulSoup(resp.text, "html.parser")
 
-    def get_pop_by_lsoa_file(file_title: str, years: list[int]) -> pd.DataFrame:
-        def year_to_fyear(year):
-            return year * 100 + (year + 1) % 100
-
+    def year_to_fyear(year):
+        return year * 100 + (year + 1) % 100
+
+    spark.sql(f"DROP TABLE IF EXISTS {table}")
+    for file_title, years in [
+        ("Mid-2011 to mid-2014", [2011, 2012, 2013, 2014]),
+        ("Mid-2015 to mid-2018", [2015, 2016, 2017, 2018]),
+        ("Mid-2019 to mid-2022", [2019, 2020, 2021]),
+        ("Mid-2022 revised (Nov 2025) to mid-2024", [2022, 2023, 2024]),
+    ]:
+        print(f"Processing file: {file_title}")
         file_link = soup.find(
             "a", attrs={"aria-label": lambda x: x and file_title in x}
         )
@@ -49,48 +56,33 @@ def year_to_fyear(year):
 
         file = io.BytesIO(response.content)
 
-        return pd.concat(
-            [
-                (
-                    pd.read_excel(file, sheet_name=f"Mid-{year} LSOA 2021", skiprows=3)
-                    .rename(columns={"LSOA 2021 Code": "lsoa21cd"})
-                    .melt(
-                        id_vars="lsoa21cd",
-                        value_vars=[f"{s}{i}" for s in "FM" for i in range(0, 91)],
-                        var_name="sex_age",
-                        value_name="population",
-                    )
-                    .assign(
-                        sex=lambda x: np.where(x["sex_age"].str[0] == "M", 1, 2),
-                        age=lambda x: x["sex_age"].str[1:].astype("int"),
-                        fyear=year_to_fyear(year),
-                    )
-                    .drop(columns=["sex_age"])
-                )
-                for year in years
-            ]
-        )
-
-    df = pd.concat(
-        [
-            get_pop_by_lsoa_file(*f)  # ty: ignore[invalid-argument-type]
-            for f in [
-                ("Mid-2011 to mid-2014", [2011, 2012, 2013, 2014]),
-                ("Mid-2015 to mid-2018", [2015, 2016, 2017, 2018]),
-                ("Mid-2019 to mid-2022", [2019, 2020, 2021]),
-                ("Mid-2022 revised (Nov 2025) to mid-2024", [2022, 2023, 2024]),
-            ]
-        ]
-    )
-
-    return df[["fyear", "lsoa21cd", "sex", "age", "population"]]
+        for year in years:
+            print(f"> {year}")
+            df = (
+                pd.read_excel(file, sheet_name=f"Mid-{year} LSOA 2021", skiprows=3)
+                .rename(columns={"LSOA 2021 Code": "lsoa21cd"})
+                .melt(
+                    id_vars="lsoa21cd",
+                    value_vars=[f"{s}{i}" for s in "FM" for i in range(0, 91)],
+                    var_name="sex_age",
+                    value_name="population",
+                )
+                .assign(
+                    sex=lambda x: np.where(x["sex_age"].str[0] == "M", 1, 2),
+                    age=lambda x: x["sex_age"].str[1:].astype("int"),
+                    fyear=year_to_fyear(year),
+                )
+                .drop(columns=["sex_age"])
+            )
+            spark.createDataFrame(df).repartition(1).write.mode("append").saveAsTable(
+                table
+            )
 
 
 def get_pop_by_lsoa21(spark: SparkSession) -> DataFrame:
     table = table_names.reference_pop_by_lsoa21
 
     if not spark.catalog.tableExists(table):
-        pop_by_lsoa21 = create_pop_by_lsoa21()
-        spark.createDataFrame(pop_by_lsoa21).write.mode("overwrite").saveAsTable(table)
+        create_pop_by_lsoa21(spark, table)
 
     return spark.read.table(table)
diff --git a/src/nhp/data/reference/provider_catchments.py b/src/nhp/data/reference/provider_catchments.py
index 9e8dd260..f123185b 100644
--- a/src/nhp/data/reference/provider_catchments.py
+++ b/src/nhp/data/reference/provider_catchments.py
@@ -10,7 +10,7 @@
 from pyspark.sql import functions as F
 
 from nhp.data.get_spark import get_spark
-from nhp.data.reference.lsoa_lookups import get_lsoa11_to_lad23_lookup
+from nhp.data.nhp_datasets.apc import hes_apc
 from nhp.data.reference.population_by_lsoa21 import get_pop_by_lad23
 from nhp.data.table_names import table_names
 
@@ -44,13 +44,14 @@ def create_provider_lad23_splits(spark: SparkSession) -> DataFrame:
     - Uses unique patient counts (person_id) to align with OHID methodology
     - Missing age/sex/LAD combinations are assigned to the most popular provider for that LAD
     """
-    lsoa11_to_lad23 = get_lsoa11_to_lad23_lookup(spark)
+    acute_providers = spark.read.table(table_names.reference_ods_trusts).filter(
+        F.col("org_type").startswith("ACUTE")
+    )
 
     df = (
-        spark.read.table(table_names.default_apc)
+        hes_apc.join(acute_providers, F.col("provider") == F.col("org_code"), "semi")
         .filter(F.col("lsoa11").startswith("E"))
         .filter(F.col("lsoa11") != "E99999999")
-        .join(lsoa11_to_lad23, F.col("lsoa11") == F.col("lsoa11cd"))
         # get unique patients; more closely related to OHIDs logic
         # (https://tinyurl.com/ohid-nhs-acute-catchments)
         .select("fyear", "provider", "lad23cd", "age", "sex", "person_id")
diff --git a/src/nhp/data/reference/trust_types.py b/src/nhp/data/reference/trust_types.py
index 4aaea3d2..e2234014 100644
--- a/src/nhp/data/reference/trust_types.py
+++ b/src/nhp/data/reference/trust_types.py
@@ -42,7 +42,7 @@ def get_eric_links():
     links = [a.attrs["href"] for a in div.find_all("a") if re.match(pattern, a.text)]
 
     # ensure the links are sorted in year order
-    return sorted([(year_to_fyear(i[-7:-3]), i) for i in links])
+    return sorted([(year_to_fyear(i[-7:-3]), i) for i in links if isinstance(i, str)])
 
 
 def get_eric_trust_data(year: int, link: str) -> pd.DataFrame:
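The population_by_lsoa21 rewrite above trades a single large `pd.concat` for a per-year append into the target table, so only one year's frame is held in driver memory at a time. The shape of that pattern, reduced to essentials (the table name and data are illustrative only):

```python
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
table = "main.sandbox.pop_by_year"  # illustrative table name

spark.sql(f"DROP TABLE IF EXISTS {table}")
for year in [2022, 2023, 2024]:
    # one small frame per iteration instead of a single concatenated frame
    pdf = pd.DataFrame(
        {"fyear": [year * 100 + (year + 1) % 100], "population": [1_000]}
    )
    spark.createDataFrame(pdf).repartition(1).write.mode("append").saveAsTable(table)
```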
"mlcsu" # mlcsu / udal - - -@dataclass -class TableNames: - # -------------------------------------------------------------------------- - # hes tables - # -------------------------------------------------------------------------- - hes_aae: str - hes_aae_diagnoses: str - hes_aae_investigations: str - hes_aae_treatments: str - # --- - hes_apc: str - hes_apc_procedures: str - hes_apc_diagnoses: str - # --- - hes_ecds: str - # --- - hes_opa: str - hes_opa_procedures: str - hes_opa_diagnoses: str - # -------------------------------------------------------------------------- - # population projections tables - # -------------------------------------------------------------------------- - population_projections_save_path: str - population_projections_births: str - population_projections_demographics: str - # -------------------------------------------------------------------------- - # reference tables - # -------------------------------------------------------------------------- - reference_ccg_to_icb: str - reference_icb_catchments: str - reference_icd10_codes: str - reference_ods_trusts: str - reference_population_by_imd_decile: str - reference_provider_main_icb: str - reference_tretspef_grouping: str - reference_lsoa11_to_lsoa21: str - reference_lsoa21_to_lad23: str - reference_lsoa11_to_lad23: str - reference_pop_by_lsoa21: str - reference_provider_lad23_splits: str - reference_pop_by_provider: str - reference_pop_by_lad23: str - reference_lad22_to_lad23: str - # TODO: convert these to tables - reference_day_procedures_code_list: str - reference_frailty_risk_scores: str - reference_lad11_to_lad23_lookup: str - reference_lsoa11_to_imd19: str - reference_pop_by_lsoa: str - reference_population_by_year: str - reference_trust_types: str - # -------------------------------------------------------------------------- - # raw data - # -------------------------------------------------------------------------- - raw_data_apc: str - raw_data_apc_mitigators: str - raw_data_ecds: str - raw_data_opa: str - raw_data_opa_mitigators: str - # -------------------------------------------------------------------------- - # aggregated data - # -------------------------------------------------------------------------- - aggregated_data_ecds: str - aggregated_data_opa: str - # -------------------------------------------------------------------------- - # inputs data - # -------------------------------------------------------------------------- - inputs_catchments: str - inputs_save_path: str - # -------------------------------------------------------------------------- - # default data - # -------------------------------------------------------------------------- - default_apc: str - default_apc_mitigators: str - default_ecds: str - default_opa: str - default_hsa_activity_tables_provider: str - default_hsa_activity_tables_icb: str - default_hsa_activity_tables_national: str - default_inequalities: str - # -------------------------------------------------------------------------- - # model data - # -------------------------------------------------------------------------- - model_data_path: str - # -------------------------------------------------------------------------- - - -table_names: TableNames -match environment: - case "mlcsu": - table_names = TableNames( - # -------------------------------------------------------------------------- - # hes tables - # -------------------------------------------------------------------------- - hes_aae="hes.silver.aae", - hes_aae_diagnoses="hes.silver.aae_diagnoses", - 
hes_aae_investigations="hes.silver.aae_investigations", - hes_aae_treatments="hes.silver.aae_treatments", - # --- - hes_apc="hes.silver.apc", - hes_apc_procedures="hes.silver.apc_procedures", - hes_apc_diagnoses="hes.silver.apc_diagnoses", - # --- - hes_ecds="hes.silver.ecds", - # --- - hes_opa="hes.silver.opa", - hes_opa_procedures="hes.silver.opa_procedures", - hes_opa_diagnoses="hes.silver.opa_diagnoses", - # -------------------------------------------------------------------------- - # population projections tables - # -------------------------------------------------------------------------- - population_projections_save_path="/Volumes/nhp/population_projections/files", - population_projections_births="nhp.population_projections.births", - population_projections_demographics="nhp.population_projections.demographics", - # -------------------------------------------------------------------------- - # reference tables - # -------------------------------------------------------------------------- - reference_ccg_to_icb="strategyunit.reference.ccg_to_icb", - reference_icb_catchments="nhp.reference.icb_catchments", - reference_icd10_codes="strategyunit.reference.icd10_codes", - reference_ods_trusts="nhp.reference.ods_trusts", - reference_population_by_imd_decile="nhp.reference.population_by_imd_decile", - reference_provider_main_icb="nhp.reference.provider_main_icb", - reference_tretspef_grouping="nhp.reference.tretspef_grouping", - reference_lsoa11_to_lsoa21="nhp.reference.lsoa11_to_lsoa21", - reference_lsoa21_to_lad23="nhp.reference.lsoa21_to_lad23", - reference_pop_by_lsoa21="nhp.reference.pop_by_lsoa21", - reference_provider_lad23_splits="nhp.reference.provider_lad23_splits", - reference_pop_by_provider="nhp.reference.pop_by_provider", - reference_pop_by_lad23="nhp.reference.pop_by_lad23", - reference_lsoa11_to_lad23="nhp.reference.lsoa11_to_lad23", - reference_lad22_to_lad23="nhp.reference.lad22_to_lad23", - # --- - reference_day_procedures_code_list="/Volumes/nhp/reference/files/day_procedures.json", - reference_frailty_risk_scores="/Volumes/nhp/reference/files/frailty_risk_scores.csv", - reference_lad11_to_lad23_lookup="/Volumes/nhp/reference/files/lad11_to_lad23_lookup.csv", - reference_lsoa11_to_imd19="strategyunit.reference.lsoa11_to_imd19", - reference_pop_by_lsoa="/Volumes/strategyunit/reference/files/sape22_pop_by_lsoa.csv", - reference_population_by_year="/Volumes/nhp/reference/files/population_by_year.parquet", - reference_trust_types="/Volumes/nhp/reference/files/trust_types.parquet", - # -------------------------------------------------------------------------- - # raw data - # -------------------------------------------------------------------------- - raw_data_apc="nhp.raw_data.apc", - raw_data_apc_mitigators="nhp.raw_data.apc_mitigators", - raw_data_ecds="nhp.raw_data.ecds", - raw_data_opa="nhp.raw_data.opa", - raw_data_opa_mitigators="nhp.raw_data.opa_mitigators", - # -------------------------------------------------------------------------- - # aggregated data - # -------------------------------------------------------------------------- - aggregated_data_ecds="nhp.aggregated_data.ecds", - aggregated_data_opa="nhp.aggregated_data.opa", - # -------------------------------------------------------------------------- - # inputs data - # -------------------------------------------------------------------------- - inputs_catchments="nhp.reference.inputs_catchments", - inputs_save_path="/Volumes/nhp/inputs_data/files/dev", - # 
-------------------------------------------------------------------------- - # default data - # -------------------------------------------------------------------------- - default_apc="nhp.default.apc", - default_apc_mitigators="nhp.default.apc_mitigators", - default_ecds="nhp.default.ecds", - default_opa="nhp.default.opa", - default_hsa_activity_tables_provider="nhp.default.hsa_activity_tables", - default_hsa_activity_tables_icb="nhp.default.hsa_activity_tables_icb", - default_hsa_activity_tables_national="nhp.default.hsa_activity_tables_national", - default_inequalities="nhp.default.inequalities", - # -------------------------------------------------------------------------- - # model data - # -------------------------------------------------------------------------- - model_data_path="/Volumes/nhp/model_data/files", - # -------------------------------------------------------------------------- - ) - case "udal": - raise NotImplementedError( - "Table names for UDAL environment are not yet defined." - ) - case _: - raise ValueError(f"Unknown environment: {environment}") diff --git a/src/nhp/data/table_names/__init__.py b/src/nhp/data/table_names/__init__.py new file mode 100644 index 00000000..5712d282 --- /dev/null +++ b/src/nhp/data/table_names/__init__.py @@ -0,0 +1,18 @@ +from nhp.data.table_names.mlcsu import mlcsu +from nhp.data.table_names.table_names import TableNames +from nhp.data.table_names.udal import udal + +# TODO: this should be set via an environment variable or similar when deploying asset bundles +environment = "udal" # mlcsu / udal + + +_selected_table_names: TableNames +match environment: + case "mlcsu": + _selected_table_names = mlcsu + case "udal": + _selected_table_names = udal + case _: + raise ValueError(f"Unknown environment: {environment}") + +table_names = _selected_table_names diff --git a/src/nhp/data/table_names/mlcsu.py b/src/nhp/data/table_names/mlcsu.py new file mode 100644 index 00000000..a9fec16c --- /dev/null +++ b/src/nhp/data/table_names/mlcsu.py @@ -0,0 +1,85 @@ +from nhp.data.table_names.table_names import TableNames + +mlcsu = TableNames( + # -------------------------------------------------------------------------- + # hes tables + # -------------------------------------------------------------------------- + hes_aae="hes.silver.aae", + hes_aae_diagnoses="hes.silver.aae_diagnoses", + hes_aae_investigations="hes.silver.aae_investigations", + hes_aae_treatments="hes.silver.aae_treatments", + # --- + hes_apc="hes.silver.apc", + hes_apc_procedures="hes.silver.apc_procedures", + hes_apc_diagnoses="hes.silver.apc_diagnoses", + # --- + hes_ecds="hes.silver.ecds", + # --- + hes_opa="hes.silver.opa", + hes_opa_procedures="hes.silver.opa_procedures", + hes_opa_diagnoses="hes.silver.opa_diagnoses", + # -------------------------------------------------------------------------- + # population projections tables + # -------------------------------------------------------------------------- + population_projections_save_path="/Volumes/nhp/population_projections/files", + population_projections_births="nhp.population_projections.births", + population_projections_demographics="nhp.population_projections.demographics", + # -------------------------------------------------------------------------- + # reference tables + # -------------------------------------------------------------------------- + reference_ccg_to_icb="strategyunit.reference.ccg_to_icb", + reference_icb_catchments="nhp.reference.icb_catchments", + 
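The new package keeps the old TODO: the environment is still a hard-coded module constant. One possible follow-up, sketched under the assumption that an environment variable (the name here is invented) would be set by the asset bundle at deploy time:

```python
import os

from nhp.data.table_names.mlcsu import mlcsu
from nhp.data.table_names.udal import udal

# NHP_ENVIRONMENT is a hypothetical variable name, not defined by the repo.
environment = os.environ.get("NHP_ENVIRONMENT", "udal")

match environment:
    case "mlcsu":
        table_names = mlcsu
    case "udal":
        table_names = udal
    case _:
        raise ValueError(f"Unknown environment: {environment}")
```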
diff --git a/src/nhp/data/table_names/mlcsu.py b/src/nhp/data/table_names/mlcsu.py
new file mode 100644
index 00000000..a9fec16c
--- /dev/null
+++ b/src/nhp/data/table_names/mlcsu.py
@@ -0,0 +1,85 @@
+from nhp.data.table_names.table_names import TableNames
+
+mlcsu = TableNames(
+    # --------------------------------------------------------------------------
+    # hes tables
+    # --------------------------------------------------------------------------
+    hes_aae="hes.silver.aae",
+    hes_aae_diagnoses="hes.silver.aae_diagnoses",
+    hes_aae_investigations="hes.silver.aae_investigations",
+    hes_aae_treatments="hes.silver.aae_treatments",
+    # ---
+    hes_apc="hes.silver.apc",
+    hes_apc_procedures="hes.silver.apc_procedures",
+    hes_apc_diagnoses="hes.silver.apc_diagnoses",
+    # ---
+    hes_ecds="hes.silver.ecds",
+    # ---
+    hes_opa="hes.silver.opa",
+    hes_opa_procedures="hes.silver.opa_procedures",
+    hes_opa_diagnoses="hes.silver.opa_diagnoses",
+    # --------------------------------------------------------------------------
+    # population projections tables
+    # --------------------------------------------------------------------------
+    population_projections_save_path="/Volumes/nhp/population_projections/files",
+    population_projections_births="nhp.population_projections.births",
+    population_projections_demographics="nhp.population_projections.demographics",
+    # --------------------------------------------------------------------------
+    # reference tables
+    # --------------------------------------------------------------------------
+    reference_ccg_to_icb="strategyunit.reference.ccg_to_icb",
+    reference_icb_catchments="nhp.reference.icb_catchments",
+    reference_icd10_codes="strategyunit.reference.icd10_codes",
+    reference_ods_trusts="nhp.reference.ods_trusts",
+    reference_population_by_imd_decile="nhp.reference.population_by_imd_decile",
+    reference_provider_main_icb="nhp.reference.provider_main_icb",
+    reference_tretspef_grouping="nhp.reference.tretspef_grouping",
+    reference_lsoa11_to_lsoa21="nhp.reference.lsoa11_to_lsoa21",
+    reference_lsoa21_to_lad23="nhp.reference.lsoa21_to_lad23",
+    reference_pop_by_lsoa21="nhp.reference.pop_by_lsoa21",
+    reference_provider_lad23_splits="nhp.reference.provider_lad23_splits",
+    reference_pop_by_provider="nhp.reference.pop_by_provider",
+    reference_pop_by_lad23="nhp.reference.pop_by_lad23",
+    reference_lsoa11_to_lad23="nhp.reference.lsoa11_to_lad23",
+    reference_lad22_to_lad23="nhp.reference.lad22_to_lad23",
+    # ---
+    reference_day_procedures_code_list="/Volumes/nhp/reference/files/day_procedures.json",
+    reference_frailty_risk_scores="/Volumes/nhp/reference/files/frailty_risk_scores.csv",
+    reference_lsoa11_to_imd19="strategyunit.reference.lsoa11_to_imd19",
+    reference_pop_by_lsoa="/Volumes/strategyunit/reference/files/sape22_pop_by_lsoa.csv",
+    reference_trust_types="/Volumes/nhp/reference/files/trust_types.parquet",
+    # --------------------------------------------------------------------------
+    # raw data
+    # --------------------------------------------------------------------------
+    raw_data_apc="nhp.raw_data.apc",
+    raw_data_apc_mitigators="nhp.raw_data.apc_mitigators",
+    raw_data_ecds="nhp.raw_data.ecds",
+    raw_data_opa="nhp.raw_data.opa",
+    raw_data_opa_mitigators="nhp.raw_data.opa_mitigators",
+    # --------------------------------------------------------------------------
+    # aggregated data
+    # --------------------------------------------------------------------------
+    aggregated_data_ecds="nhp.aggregated_data.ecds",
+    aggregated_data_opa="nhp.aggregated_data.opa",
+    # --------------------------------------------------------------------------
+    # inputs data
+    # --------------------------------------------------------------------------
+    inputs_catchments="nhp.reference.inputs_catchments",
+    inputs_save_path="/Volumes/nhp/inputs_data/files/dev",
+    # --------------------------------------------------------------------------
+    # default data
+    # --------------------------------------------------------------------------
+    default_apc="nhp.default.apc",
+    default_apc_mitigators="nhp.default.apc_mitigators",
+    default_ecds="nhp.default.ecds",
+    default_opa="nhp.default.opa",
+    default_hsa_activity_tables_provider="nhp.default.hsa_activity_tables",
+    default_hsa_activity_tables_icb="nhp.default.hsa_activity_tables_icb",
+    default_hsa_activity_tables_national="nhp.default.hsa_activity_tables_national",
+    default_inequalities="nhp.default.inequalities",
+    # --------------------------------------------------------------------------
+    # model data
+    # --------------------------------------------------------------------------
+    model_data_path="/Volumes/nhp/model_data/files",
+    # --------------------------------------------------------------------------
+)
diff --git a/src/nhp/data/table_names/table_names.py b/src/nhp/data/table_names/table_names.py
new file mode 100644
index 00000000..f236a5bf
--- /dev/null
+++ b/src/nhp/data/table_names/table_names.py
@@ -0,0 +1,86 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class TableNames:
+    # --------------------------------------------------------------------------
+    # hes tables
+    # --------------------------------------------------------------------------
+    hes_aae: str
+    hes_aae_diagnoses: str
+    hes_aae_investigations: str
+    hes_aae_treatments: str
+    # ---
+    hes_apc: str
+    hes_apc_procedures: str
+    hes_apc_diagnoses: str
+    # ---
+    hes_ecds: str
+    # ---
+    hes_opa: str
+    hes_opa_procedures: str
+    hes_opa_diagnoses: str
+    # --------------------------------------------------------------------------
+    # population projections tables
+    # --------------------------------------------------------------------------
+    population_projections_save_path: str
+    population_projections_births: str
+    population_projections_demographics: str
+    # --------------------------------------------------------------------------
+    # reference tables
+    # --------------------------------------------------------------------------
+    reference_ccg_to_icb: str
+    reference_icb_catchments: str
+    reference_icd10_codes: str
+    reference_ods_trusts: str
+    reference_population_by_imd_decile: str
+    reference_provider_main_icb: str
+    reference_tretspef_grouping: str
+    reference_lsoa11_to_lsoa21: str
+    reference_lsoa21_to_lad23: str
+    reference_lsoa11_to_lad23: str
+    reference_pop_by_lsoa21: str
+    reference_provider_lad23_splits: str
+    reference_pop_by_provider: str
+    reference_pop_by_lad23: str
+    reference_lad22_to_lad23: str
+    # TODO: convert these to tables
+    reference_day_procedures_code_list: str
+    reference_frailty_risk_scores: str
+    reference_lsoa11_to_imd19: str
+    reference_pop_by_lsoa: str
+    reference_trust_types: str
+    # --------------------------------------------------------------------------
+    # raw data
+    # --------------------------------------------------------------------------
+    raw_data_apc: str
+    raw_data_apc_mitigators: str
+    raw_data_ecds: str
+    raw_data_opa: str
+    raw_data_opa_mitigators: str
+    # --------------------------------------------------------------------------
+    # aggregated data
+    # --------------------------------------------------------------------------
+    aggregated_data_ecds: str
+    aggregated_data_opa: str
+    # --------------------------------------------------------------------------
+    # inputs data
+    # --------------------------------------------------------------------------
+    inputs_catchments: str
+    inputs_save_path: str
+    # --------------------------------------------------------------------------
+    # default data
+    # --------------------------------------------------------------------------
+    default_apc: str
+    default_apc_mitigators: str
+    default_ecds: str
+    default_opa: str
+    default_hsa_activity_tables_provider: str
+    default_hsa_activity_tables_icb: str
+    default_hsa_activity_tables_national: str
+    default_inequalities: str
+    # --------------------------------------------------------------------------
+    # model data
+    # --------------------------------------------------------------------------
+    model_data_path: str
+    # --------------------------------------------------------------------------
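With the physical names behind dataclass fields, callers never spell out a catalog path, so swapping mlcsu for udal re-points every read and write at once. Typical consumption, as seen throughout the modules above:

```python
from nhp.data.get_spark import get_spark
from nhp.data.table_names import table_names

spark = get_spark()

# Field access instead of a literal "catalog.schema.table" string; the
# selected environment decides which physical table this resolves to.
apc = spark.read.table(table_names.raw_data_apc)
```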
diff --git a/src/nhp/data/table_names/udal.py b/src/nhp/data/table_names/udal.py
new file mode 100644
index 00000000..0d1320f7
--- /dev/null
+++ b/src/nhp/data/table_names/udal.py
@@ -0,0 +1,85 @@
+from nhp.data.table_names.table_names import TableNames
+
+udal = TableNames(
+    # --------------------------------------------------------------------------
+    # hes tables
+    # --------------------------------------------------------------------------
+    hes_aae="udal_lake_mart.newhospitalprogramme.hes_aae",
+    hes_aae_diagnoses="udal_lake_mart.newhospitalprogramme.hes_aae_diagnoses",
+    hes_aae_investigations="udal_lake_mart.newhospitalprogramme.hes_aae_investigations",
+    hes_aae_treatments="udal_lake_mart.newhospitalprogramme.hes_aae_treatments",
+    # ---
+    hes_apc="udal_lake_mart.newhospitalprogramme.hes_apc",
+    hes_apc_procedures="udal_lake_mart.newhospitalprogramme.hes_apc_procedures",
+    hes_apc_diagnoses="udal_lake_mart.newhospitalprogramme.hes_apc_diagnoses",
+    # ---
+    hes_ecds="udal_silver_restricted.mesh_ecds.ec_core",
+    # ---
+    hes_opa="udal_lake_mart.newhospitalprogramme.hes_opa",
+    hes_opa_procedures="udal_lake_mart.newhospitalprogramme.hes_opa_procedures",
+    hes_opa_diagnoses="udal_lake_mart.newhospitalprogramme.hes_opa_diagnoses",
+    # --------------------------------------------------------------------------
+    # population projections tables
+    # --------------------------------------------------------------------------
+    population_projections_save_path="/Volumes/udal_lake_mart/newhospitalprogramme/files/population_projections",
+    population_projections_births="udal_lake_mart.newhospitalprogramme.population_projections_births",
+    population_projections_demographics="udal_lake_mart.newhospitalprogramme.population_projections_demographics",
+    # --------------------------------------------------------------------------
+    # reference tables
+    # --------------------------------------------------------------------------
+    reference_ccg_to_icb="udal_lake_mart.newhospitalprogramme.reference_ccg_to_icb",
+    reference_icb_catchments="udal_lake_mart.newhospitalprogramme.reference_icb_catchments",
+    reference_icd10_codes="udal_lake_mart.newhospitalprogramme.reference_icd10_codes",
+    reference_ods_trusts="udal_lake_mart.newhospitalprogramme.reference_ods_trusts",
+    reference_population_by_imd_decile="udal_lake_mart.newhospitalprogramme.reference_population_by_imd_decile",
+    reference_provider_main_icb="udal_lake_mart.newhospitalprogramme.reference_provider_main_icb",
+    reference_tretspef_grouping="udal_lake_mart.newhospitalprogramme.reference_tretspef_grouping",
+    reference_lsoa11_to_lsoa21="udal_lake_mart.newhospitalprogramme.reference_lsoa11_to_lsoa21",
+    reference_lsoa21_to_lad23="udal_lake_mart.newhospitalprogramme.reference_lsoa21_to_lad23",
+    reference_pop_by_lsoa21="udal_lake_mart.newhospitalprogramme.reference_pop_by_lsoa21",
+    reference_provider_lad23_splits="udal_lake_mart.newhospitalprogramme.reference_provider_lad23_splits",
+    reference_pop_by_provider="udal_lake_mart.newhospitalprogramme.reference_pop_by_provider",
+    reference_pop_by_lad23="udal_lake_mart.newhospitalprogramme.reference_pop_by_lad23",
+    reference_lsoa11_to_lad23="udal_lake_mart.newhospitalprogramme.reference_lsoa11_to_lad23",
+    reference_lad22_to_lad23="udal_lake_mart.newhospitalprogramme.reference_lad22_to_lad23",
+    # ---
+    reference_day_procedures_code_list="/Volumes/udal_lake_mart/newhospitalprogramme/files/reference/day_procedures.json",
+    reference_frailty_risk_scores="/Volumes/udal_lake_mart/newhospitalprogramme/files/reference/frailty_risk_scores.csv",
+    reference_lsoa11_to_imd19="udal_lake_mart.newhospitalprogramme.reference_lsoa11_to_imd19",
+    reference_pop_by_lsoa="/Volumes/udal_lake_mart/newhospitalprogramme/files/reference/sape22_pop_by_lsoa.csv",
+    reference_trust_types="/Volumes/udal_lake_mart/newhospitalprogramme/files/reference/trust_types.parquet",
+    # --------------------------------------------------------------------------
+    # raw data
+    # --------------------------------------------------------------------------
+    raw_data_apc="udal_lake_mart.newhospitalprogramme.raw_data_apc",
+    raw_data_apc_mitigators="udal_lake_mart.newhospitalprogramme.raw_data_apc_mitigators",
+    raw_data_ecds="udal_lake_mart.newhospitalprogramme.raw_data_ecds",
+    raw_data_opa="udal_lake_mart.newhospitalprogramme.raw_data_opa",
+    raw_data_opa_mitigators="udal_lake_mart.newhospitalprogramme.raw_data_opa_mitigators",
+    # --------------------------------------------------------------------------
+    # aggregated data
+    # --------------------------------------------------------------------------
+    aggregated_data_ecds="udal_lake_mart.newhospitalprogramme.aggregated_data_ecds",
+    aggregated_data_opa="udal_lake_mart.newhospitalprogramme.aggregated_data_opa",
+    # --------------------------------------------------------------------------
+    # inputs data
+    # --------------------------------------------------------------------------
+    inputs_catchments="udal_lake_mart.newhospitalprogramme.reference_inputs_catchments",
+    inputs_save_path="/Volumes/udal_lake_mart/newhospitalprogramme/files/inputs_data/dev",
+    # --------------------------------------------------------------------------
+    # default data
+    # --------------------------------------------------------------------------
+    default_apc="udal_lake_mart.newhospitalprogramme.default_apc",
+    default_apc_mitigators="udal_lake_mart.newhospitalprogramme.default_apc_mitigators",
+    default_ecds="udal_lake_mart.newhospitalprogramme.default_ecds",
+    default_opa="udal_lake_mart.newhospitalprogramme.default_opa",
+    default_hsa_activity_tables_provider="udal_lake_mart.newhospitalprogramme.default_hsa_activity_tables",
+    default_hsa_activity_tables_icb="udal_lake_mart.newhospitalprogramme.default_hsa_activity_tables_icb",
+    default_hsa_activity_tables_national="udal_lake_mart.newhospitalprogramme.default_hsa_activity_tables_national",
+    default_inequalities="udal_lake_mart.newhospitalprogramme.default_inequalities",
+    # --------------------------------------------------------------------------
+    # model data
+    # --------------------------------------------------------------------------
+    model_data_path="/Volumes/udal_lake_mart/newhospitalprogramme/files/model_data",
+    # --------------------------------------------------------------------------
+)
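The lock-file changes below pick up azure-storage-blob (with its azure-core and isodate transitive dependencies), which presumably backs the repo's new data-movement step. For orientation, a hedged sketch of the kind of upload that library enables; the account URL, credential handling, container and paths are all invented:

```python
from azure.storage.blob import BlobServiceClient

service = BlobServiceClient(
    account_url="https://example.blob.core.windows.net",  # invented account
    credential="<sas-token>",  # real credential handling is not shown in this diff
)
container = service.get_container_client("nhp-extracts")  # invented container

with open("extract.parquet", "rb") as data:
    container.upload_blob("v1/extract.parquet", data, overwrite=True)
```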
diff --git a/uv.lock b/uv.lock
index 0b2f40d7..633ae8b5 100644
--- a/uv.lock
+++ b/uv.lock
@@ -47,6 +47,34 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" },
 ]
 
+[[package]]
+name = "azure-core"
+version = "1.38.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "requests" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c8/29/9641b73248745774a52c7ce7f965ed1febbdea787ec21caad3ae6891d18a/azure_core-1.38.3.tar.gz", hash = "sha256:a7931fd445cb4af8802c6f39c6a326bbd1e34b115846550a8245fa656ead6f8e", size = 367267, upload-time = "2026-03-12T20:28:21.122Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9a/3d/ac86083efa45a439d0bbfb7947615227813d368b9e1e93d23fd30de6fec0/azure_core-1.38.3-py3-none-any.whl", hash = "sha256:bf59d29765bf4748ab9edf25f98a30b7ea9797f43e367c06d846a30b29c1f845", size = 218231, upload-time = "2026-03-12T20:28:22.462Z" },
+]
+
+[[package]]
+name = "azure-storage-blob"
+version = "12.28.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "azure-core" },
+    { name = "cryptography" },
+    { name = "isodate" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/71/24/072ba8e27b0e2d8fec401e9969b429d4f5fc4c8d4f0f05f4661e11f7234a/azure_storage_blob-12.28.0.tar.gz", hash = "sha256:e7d98ea108258d29aa0efbfd591b2e2075fa1722a2fae8699f0b3c9de11eff41", size = 604225, upload-time = "2026-01-06T23:48:57.282Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d8/3a/6ef2047a072e54e1142718d433d50e9514c999a58f51abfff7902f3a72f8/azure_storage_blob-12.28.0-py3-none-any.whl", hash = "sha256:00fb1db28bf6a7b7ecaa48e3b1d5c83bfadacc5a678b77826081304bd87d6461", size = 431499, upload-time = "2026-01-06T23:48:58.995Z" },
+]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.14.3"
@@ -601,7 +629,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" },
     { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" },
     { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" },
-    { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" },
     { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" },
     { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" },
     { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" },
@@ -609,7 +636,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" },
     { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" },
     { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" },
-    { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" },
     { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" },
     { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" },
     { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" },
@@ -617,7 +643,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d7/7c/f0a6d0ede2c7bf092d00bc83ad5bafb7e6ec9b4aab2fbdfa6f134dc73327/greenlet-3.3.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:60c2ef0f578afb3c8d92ea07ad327f9a062547137afe91f38408f08aacab667f", size = 275671, upload-time = "2025-12-04T14:23:05.267Z" },
     { url = "https://files.pythonhosted.org/packages/44/06/dac639ae1a50f5969d82d2e3dd9767d30d6dbdbab0e1a54010c8fe90263c/greenlet-3.3.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a5d554d0712ba1de0a6c94c640f7aeba3f85b3a6e1f2899c11c2c0428da9365", size = 646360, upload-time = "2025-12-04T14:50:10.026Z" },
     { url = "https://files.pythonhosted.org/packages/e0/94/0fb76fe6c5369fba9bf98529ada6f4c3a1adf19e406a47332245ef0eb357/greenlet-3.3.0-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3a898b1e9c5f7307ebbde4102908e6cbfcb9ea16284a3abe15cab996bee8b9b3", size = 658160, upload-time = "2025-12-04T14:57:45.41Z" },
-    { url = "https://files.pythonhosted.org/packages/93/79/d2c70cae6e823fac36c3bbc9077962105052b7ef81db2f01ec3b9bf17e2b/greenlet-3.3.0-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:dcd2bdbd444ff340e8d6bdf54d2f206ccddbb3ccfdcd3c25bf4afaa7b8f0cf45", size = 671388, upload-time = "2025-12-04T15:07:15.789Z" },
     { url = "https://files.pythonhosted.org/packages/b8/14/bab308fc2c1b5228c3224ec2bf928ce2e4d21d8046c161e44a2012b5203e/greenlet-3.3.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5773edda4dc00e173820722711d043799d3adb4f01731f40619e07ea2750b955", size = 660166, upload-time = "2025-12-04T14:26:05.099Z" },
     { url = "https://files.pythonhosted.org/packages/4b/d2/91465d39164eaa0085177f61983d80ffe746c5a1860f009811d498e7259c/greenlet-3.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ac0549373982b36d5fd5d30beb8a7a33ee541ff98d2b502714a09f1169f31b55", size = 1615193, upload-time = "2025-12-04T15:04:27.041Z" },
     { url = "https://files.pythonhosted.org/packages/42/1b/83d110a37044b92423084d52d5d5a3b3a73cafb51b547e6d7366ff62eff1/greenlet-3.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d198d2d977460358c3b3a4dc844f875d1adb33817f0613f663a656f463764ccc", size = 1683653, upload-time = "2025-12-04T14:27:32.366Z" },
@@ -625,7 +650,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a0/66/bd6317bc5932accf351fc19f177ffba53712a202f9df10587da8df257c7e/greenlet-3.3.0-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:d6ed6f85fae6cdfdb9ce04c9bf7a08d666cfcfb914e7d006f44f840b46741931", size = 282638, upload-time = "2025-12-04T14:25:20.941Z" },
     { url = "https://files.pythonhosted.org/packages/30/cf/cc81cb030b40e738d6e69502ccbd0dd1bced0588e958f9e757945de24404/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d9125050fcf24554e69c4cacb086b87b3b55dc395a8b3ebe6487b045b2614388", size = 651145, upload-time = "2025-12-04T14:50:11.039Z" },
     { url = "https://files.pythonhosted.org/packages/9c/ea/1020037b5ecfe95ca7df8d8549959baceb8186031da83d5ecceff8b08cd2/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:87e63ccfa13c0a0f6234ed0add552af24cc67dd886731f2261e46e241608bee3", size = 654236, upload-time = "2025-12-04T14:57:47.007Z" },
-    { url = "https://files.pythonhosted.org/packages/69/cc/1e4bae2e45ca2fa55299f4e85854606a78ecc37fead20d69322f96000504/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2662433acbca297c9153a4023fe2161c8dcfdcc91f10433171cf7e7d94ba2221", size = 662506, upload-time = "2025-12-04T15:07:16.906Z" },
     { url = "https://files.pythonhosted.org/packages/57/b9/f8025d71a6085c441a7eaff0fd928bbb275a6633773667023d19179fe815/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3c6e9b9c1527a78520357de498b0e709fb9e2f49c3a513afd5a249007261911b", size = 653783, upload-time = "2025-12-04T14:26:06.225Z" },
     { url = "https://files.pythonhosted.org/packages/f6/c7/876a8c7a7485d5d6b5c6821201d542ef28be645aa024cfe1145b35c120c1/greenlet-3.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:286d093f95ec98fdd92fcb955003b8a3d054b4e2cab3e2707a5039e7b50520fd", size = 1614857, upload-time = "2025-12-04T15:04:28.484Z" },
     { url = "https://files.pythonhosted.org/packages/4f/dc/041be1dff9f23dac5f48a43323cd0789cb798342011c19a248d9c9335536/greenlet-3.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c10513330af5b8ae16f023e8ddbfb486ab355d04467c4679c5cfe4659975dd9", size = 1676034, upload-time = "2025-12-04T14:27:33.531Z" },
@@ -737,6 +761,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" },
 ]
 
+[[package]]
+name = "isodate"
+version = "0.7.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705, upload-time = "2024-10-08T23:04:11.5Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" },
+]
+
 [[package]]
 name = "itsdangerous"
 version = "2.2.0"
@@ -1133,6 +1166,7 @@ name = "nhp-data"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [
+    { name = "azure-storage-blob" },
     { name = "beautifulsoup4" },
     { name = "databricks-connect" },
     { name = "mlflow" },
@@ -1148,6 +1182,7 @@
 
 [package.metadata]
 requires-dist = [
+    { name = "azure-storage-blob", specifier = ">=12.28.0" },
     { name = "beautifulsoup4", specifier = ">=4.13.4" },
     { name = "databricks-connect", specifier = ">=16.4" },
     { name = "mlflow", specifier = ">=3.3.2" },