LCZ_coding/validate_extracted_data.py at main · Logic06183/LCZ_coding · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
"""
Data Validation Script for Johannesburg Temperature Extraction
===============================================================

This script validates the extracted temperature data to ensure it meets
quality standards and is suitable for LCZ/UHI analysis.

Validation checks:
1. Data structure and format
2. Completeness (missing values)
3. Temperature ranges (physical plausibility)
4. Temporal continuity
5. Spatial coverage
6. LCZ4r compatibility
"""

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
import json
import sys


class TemperatureDataValidator:
    """Validate extracted temperature data for LCZ/UHI analysis."""

    def __init__(self, csv_path: str):
        """
        Initialize validator with data path.

        Args:
            csv_path: Path to the extracted CSV file
        """
        self.csv_path = Path(csv_path)
        self.df = None
        self.validation_results = {
            'timestamp': datetime.now().isoformat(),
            'file': str(csv_path),
            'checks': {}
        }

    def load_data(self) -> bool:
        """Load and perform basic checks on the CSV file."""
        print("="*60)
        print("TEMPERATURE DATA VALIDATION")
        print("="*60)
        print(f"\nFile: {self.csv_path}")

        if not self.csv_path.exists():
            print(f"❌ ERROR: File not found: {self.csv_path}")
            return False

        try:
            self.df = pd.read_csv(self.csv_path)
            print("✓ File loaded successfully")
            return True
        except Exception as e:
            print(f"❌ ERROR loading file: {e}")
            return False

    def check_structure(self) -> bool:
        """Validate data structure and column names."""
        print("\n" + "-"*60)
        print("1. STRUCTURE CHECK")
        print("-"*60)

        required_columns = ['date', 'id', 'lat', 'long', 'temp']
        missing_columns = [col for col in required_columns if col not in self.df.columns]

        if missing_columns:
            print(f"❌ Missing columns: {missing_columns}")
            self.validation_results['checks']['structure'] = {
                'passed': False,
                'error': f"Missing columns: {missing_columns}"
            }
            return False

        print(f"✓ All required columns present: {required_columns}")
        print(f"  Total columns: {self.df.columns.tolist()}")
        print(f"  Total rows: {len(self.df)}")

        self.validation_results['checks']['structure'] = {
            'passed': True,
            'rows': len(self.df),
            'columns': self.df.columns.tolist()
        }
        return True

    def check_completeness(self) -> bool:
        """Check for missing values."""
        print("\n" + "-"*60)
        print("2. COMPLETENESS CHECK")
        print("-"*60)

        missing = self.df.isnull().sum()
        total_cells = len(self.df) * len(self.df.columns)
        missing_percent = (missing.sum() / total_cells) * 100

        print(f"Missing values per column:")
        for col in self.df.columns:
            if missing[col] > 0:
                pct = (missing[col] / len(self.df)) * 100
                print(f"  {col}: {missing[col]} ({pct:.1f}%)")

        if missing_percent > 5:
            print(f"\n⚠️  WARNING: {missing_percent:.1f}% of data is missing")
            passed = False
        elif missing_percent > 0:
            print(f"\n⚠️  {missing_percent:.1f}% of data is missing (acceptable)")
            passed = True
        else:
            print("\n✓ No missing values")
            passed = True

        self.validation_results['checks']['completeness'] = {
            'passed': passed,
            'missing_percent': round(missing_percent, 2),
            'missing_by_column': missing.to_dict()
        }
        return passed

    def check_temperature_ranges(self) -> bool:
        """Validate temperature values are physically plausible."""
        print("\n" + "-"*60)
        print("3. TEMPERATURE RANGE CHECK")
        print("-"*60)

        # Remove missing values for this check
        temp_valid = self.df['temp'].dropna()

        # Expected ranges for Johannesburg summer (January-February)
        expected_min = 10  # °C (cool night)
        expected_max = 40  # °C (hot day)

        min_temp = temp_valid.min()
        max_temp = temp_valid.max()
        mean_temp = temp_valid.mean()
        std_temp = temp_valid.std()

        print(f"Temperature statistics (°C):")
        print(f"  Min:  {min_temp:.2f}")
        print(f"  Max:  {max_temp:.2f}")
        print(f"  Mean: {mean_temp:.2f}")
        print(f"  Std:  {std_temp:.2f}")

        # Check for outliers
        outliers_low = temp_valid < expected_min
        outliers_high = temp_valid > expected_max

        passed = True

        if outliers_low.sum() > 0:
            print(f"\n⚠️  {outliers_low.sum()} values below expected minimum ({expected_min}°C)")
            print(f"     Lowest: {temp_valid[outliers_low].min():.2f}°C")
            if min_temp < 0:
                print(f"     ❌ WARNING: Temperatures below 0°C are unusual for Johannesburg summer")
                passed = False

        if outliers_high.sum() > 0:
            print(f"\n⚠️  {outliers_high.sum()} values above expected maximum ({expected_max}°C)")
            print(f"     Highest: {temp_valid[outliers_high].max():.2f}°C")
            if max_temp > 45:
                print(f"     ❌ WARNING: Temperatures above 45°C are unusual for Johannesburg")
                passed = False

        if passed and outliers_low.sum() == 0 and outliers_high.sum() == 0:
            print(f"\n✓ All temperatures within expected range ({expected_min}-{expected_max}°C)")

        self.validation_results['checks']['temperature_range'] = {
            'passed': passed,
            'min': float(min_temp),
            'max': float(max_temp),
            'mean': float(mean_temp),
            'std': float(std_temp),
            'outliers_low': int(outliers_low.sum()),
            'outliers_high': int(outliers_high.sum())
        }
        return passed

    def check_temporal_continuity(self) -> bool:
        """Check for temporal gaps in the data."""
        print("\n" + "-"*60)
        print("4. TEMPORAL CONTINUITY CHECK")
        print("-"*60)

        # Convert date column to datetime
        self.df['datetime'] = pd.to_datetime(self.df['date'])

        date_range = self.df['datetime'].max() - self.df['datetime'].min()
        print(f"Date range: {self.df['datetime'].min()} to {self.df['datetime'].max()}")
        print(f"Total span: {date_range.days} days, {date_range.seconds//3600} hours")

        # Check for each location
        locations = self.df['id'].unique()
        print(f"\nTemporal coverage per location:")

        passed = True
        for location in locations:
            loc_data = self.df[self.df['id'] == location].sort_values('datetime')
            n_records = len(loc_data)

            # Calculate expected number of records (hourly for date range)
            expected_records = int(date_range.total_seconds() / 3600) + 1

            coverage_pct = (n_records / expected_records) * 100

            print(f"  {location}: {n_records} records ({coverage_pct:.1f}% coverage)")

            if coverage_pct < 80:
                print(f"    ⚠️  WARNING: Low temporal coverage")
                passed = False

        if passed:
            print("\n✓ Temporal coverage is adequate for all locations")

        self.validation_results['checks']['temporal_continuity'] = {
            'passed': passed,
            'date_range_days': date_range.days,
            'locations_checked': len(locations)
        }
        return passed

    def check_spatial_coverage(self) -> bool:
        """Validate spatial coverage across urban gradient."""
        print("\n" + "-"*60)
        print("5. SPATIAL COVERAGE CHECK")
        print("-"*60)

        # Expected locations
        expected_locations = [
            'JHB_CBD', 'JHB_Sandton', 'JHB_Rosebank', 'JHB_Soweto',
            'JHB_Randburg', 'JHB_Midrand', 'JHB_Lanseria',
            'JHB_East_Rural', 'JHB_South_Rural', 'JHB_North_Rural'
        ]

        actual_locations = self.df['id'].unique().tolist()
        missing_locations = [loc for loc in expected_locations if loc not in actual_locations]

        print(f"Expected locations: {len(expected_locations)}")
        print(f"Actual locations: {len(actual_locations)}")

        if missing_locations:
            print(f"\n⚠️  Missing locations: {missing_locations}")
            passed = False
        else:
            print("\n✓ All expected locations present")
            passed = True

        # Check coordinate ranges (Johannesburg bounding box)
        lat_min, lat_max = self.df['lat'].min(), self.df['lat'].max()
        long_min, long_max = self.df['long'].min(), self.df['long'].max()

        print(f"\nCoordinate ranges:")
        print(f"  Latitude: {lat_min:.4f} to {lat_max:.4f}")
        print(f"  Longitude: {long_min:.4f} to {long_max:.4f}")

        # Expected Johannesburg bounds
        expected_bounds = {
            'lat': (-26.45, -25.95),
            'long': (27.75, 28.35)
        }

        if not (expected_bounds['lat'][0] <= lat_min <= lat_max <= expected_bounds['lat'][1]):
            print(f"  ⚠️  Latitude out of expected range: {expected_bounds['lat']}")
            passed = False

        if not (expected_bounds['long'][0] <= long_min <= long_max <= expected_bounds['long'][1]):
            print(f"  ⚠️  Longitude out of expected range: {expected_bounds['long']}")
            passed = False

        self.validation_results['checks']['spatial_coverage'] = {
            'passed': passed,
            'locations': actual_locations,
            'missing_locations': missing_locations,
            'lat_range': [float(lat_min), float(lat_max)],
            'long_range': [float(long_min), float(long_max)]
        }
        return passed

    def check_lcz4r_compatibility(self) -> bool:
        """Verify compatibility with LCZ4r R package format."""
        print("\n" + "-"*60)
        print("6. LCZ4R COMPATIBILITY CHECK")
        print("-"*60)

        # Check column names match LCZ4r expectations
        expected_format = {
            'date': 'datetime string',
            'id': 'station identifier',
            'lat': 'numeric latitude',
            'long': 'numeric longitude',
            'temp': 'numeric temperature'
        }

        print("Required format:")
        for col, desc in expected_format.items():
            print(f"  {col}: {desc}")

        # Check data types
        passed = True

        # Date should be parseable as datetime
        try:
            pd.to_datetime(self.df['date'])
            print("\n✓ Date column is valid datetime format")
        except:
            print("\n❌ Date column cannot be parsed as datetime")
            passed = False

        # id should be string
        if self.df['id'].dtype == 'object':
            print("✓ ID column is string type")
        else:
            print("⚠️  ID column should be string type")

        # lat, long, temp should be numeric
        for col in ['lat', 'long', 'temp']:
            if pd.api.types.is_numeric_dtype(self.df[col]):
                print(f"✓ {col} column is numeric")
            else:
                print(f"❌ {col} column is not numeric")
                passed = False

        # Check for reasonable number of unique dates per location
        dates_per_location = self.df.groupby('id')['date'].nunique().mean()
        print(f"\nAverage unique dates per location: {dates_per_location:.0f}")

        if dates_per_location < 100:
            print("⚠️  Low number of observations per location")
            print("   Consider extracting more data for robust analysis")

        self.validation_results['checks']['lcz4r_compatibility'] = {
            'passed': passed,
            'avg_dates_per_location': float(dates_per_location)
        }
        return passed

    def generate_summary(self):
        """Generate validation summary."""
        print("\n" + "="*60)
        print("VALIDATION SUMMARY")
        print("="*60)

        all_passed = all(
            check['passed'] for check in self.validation_results['checks'].values()
        )

        if all_passed:
            print("\n✓✓✓ ALL VALIDATION CHECKS PASSED ✓✓✓")
            print("\nThe data is ready for LCZ/UHI analysis with LCZ4r.")
        else:
            print("\n⚠️⚠️⚠️ SOME VALIDATION CHECKS FAILED ⚠️⚠️⚠️")
            print("\nReview the warnings above before proceeding.")
            print("The data may still be usable but requires careful interpretation.")

        print("\nCheck results:")
        for check_name, result in self.validation_results['checks'].items():
            status = "✓ PASS" if result['passed'] else "❌ FAIL"
            print(f"  {check_name.replace('_', ' ').title()}: {status}")

        print("\n" + "="*60)

        # Save validation report
        report_path = self.csv_path.parent / f"{self.csv_path.stem}_validation_report.json"
        with open(report_path, 'w') as f:
            json.dump(self.validation_results, f, indent=2)

        print(f"\nValidation report saved: {report_path}")

        return all_passed

    def run_validation(self) -> bool:
        """Run all validation checks."""
        if not self.load_data():
            return False

        checks = [
            self.check_structure(),
            self.check_completeness(),
            self.check_temperature_ranges(),
            self.check_temporal_continuity(),
            self.check_spatial_coverage(),
            self.check_lcz4r_compatibility()
        ]

        return self.generate_summary()


def main():
    """
    Command-line entry point.

    Expects the CSV path as the first command-line argument, runs the full
    validation and terminates the process with an exit status:

    * 0 - all validation checks passed
    * 1 - the file could not be loaded or a check failed
    * 2 - no CSV path was given (usage error)
    """
    if len(sys.argv) < 2:
        print("Usage: python validate_extracted_data.py <path_to_csv>")
        print("\nExample:")
        print("  python validate_extracted_data.py jhb_temperature_era5_20260211_120000.csv")
        # A missing argument must not look like a successful validation to
        # shells or pipelines that inspect the exit status.
        sys.exit(2)

    csv_path = sys.argv[1]

    validator = TemperatureDataValidator(csv_path)
    success = validator.run_validation()

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()