"""Feature engineering + geospatial helpers for GeoCrop.

This module is shared by training (feature selection + scaling helpers)
AND inference (DEA STAC fetch + raster alignment + smoothing).

IMPORTANT: This implementation exactly replicates train.py feature engineering:
- Savitzky-Golay smoothing (window=5, polyorder=2) with 0-interpolation
- Phenology metrics (amplitude, AUC, peak_timestep, max_slope)
- Harmonic/Fourier features (1st and 2nd order sin/cos)
- Seasonal window statistics (Early: Oct-Dec, Peak: Jan-Mar, Late: Apr-Jun)
"""
|
|
|
|
from __future__ import annotations

import json
import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, Iterable, List, Optional, Tuple

import numpy as np
import pandas as pd

# Raster / geo
# NOTE: rasterio submodules used via attribute access (rasterio.windows,
# rasterio.warp, rasterio.io) must be imported explicitly — a bare
# `import rasterio` does not guarantee they are bound as attributes.
import rasterio
import rasterio.io
import rasterio.warp
import rasterio.windows
from rasterio.enums import Resampling
|
|
|
|
|
|
# ==========================================
|
|
# Training helpers
|
|
# ==========================================
|
|
|
|
def drop_junk_columns(df: pd.DataFrame, junk_cols: List[str]) -> pd.DataFrame:
    """Return *df* without junk/spatial columns that would cause data leakage.

    Only names actually present in ``df`` are dropped; unknown entries in
    ``junk_cols`` are ignored. The input frame is not mutated.

    Matches train.py junk_cols: ['.geo', 'system:index', 'latitude', 'longitude',
    'lat', 'lon', 'ID', 'parent_id', 'batch_id', 'is_syn']
    """
    present = set(df.columns)
    return df.drop(columns=[name for name in junk_cols if name in present])
|
|
|
|
|
|
def scout_feature_selection(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    n_estimators: int = 100,
    random_state: int = 42,
) -> List[str]:
    """Scout LightGBM feature selection (keeps non-zero importances).

    Fits a quick LGBMClassifier and returns the names of all features whose
    importance is strictly positive, ordered by descending importance. If no
    feature has positive importance, every column is kept so training can
    still proceed.
    """
    import lightgbm as lgb

    model = lgb.LGBMClassifier(n_estimators=n_estimators, random_state=random_state, verbose=-1)
    model.fit(X_train, y_train)

    # Rank features by the fitted model's importances (descending).
    ranking = pd.DataFrame(
        {"Feature": X_train.columns, "Importance": model.feature_importances_}
    ).sort_values("Importance", ascending=False)

    kept = ranking.loc[ranking["Importance"] > 0, "Feature"].tolist()

    # Fallback: keep everything (better than breaking training).
    return kept if kept else list(X_train.columns)
|
|
|
|
|
|
def scale_numeric_features(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
):
    """Scale only numeric columns, return (X_train_scaled, X_test_scaled, scaler).

    Uses StandardScaler fitted on the training split only (matches train.py);
    the test split is transformed with the training statistics. Non-numeric
    columns are passed through untouched, and the inputs are not mutated.
    """
    from sklearn.preprocessing import StandardScaler

    numeric_cols = X_train.select_dtypes(include=[np.number]).columns

    scaler = StandardScaler()
    train_scaled = X_train.copy()
    test_scaled = X_test.copy()

    # Fit on train only; apply the same statistics to test.
    train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

    return train_scaled, test_scaled, scaler
|
|
|
|
|
|
# ==========================================
|
|
# INFERENCE-ONLY FEATURE ENGINEERING
|
|
# These functions replicate train.py for raster-based inference
|
|
# ==========================================
|
|
|
|
def apply_smoothing_to_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str]
) -> Dict[str, np.ndarray]:
    """Apply Savitzky-Golay smoothing to time-series raster arrays.

    Replicates train.py apply_smoothing():
    1. Replace 0 with NaN (0 is treated as "missing observation")
    2. Linear interpolate across the time axis, fillna(0)
    3. Savitzky-Golay: window_length=5, polyorder=2

    PERF: the per-pixel ``pd.Series`` loop of the original implementation is
    replaced by a single vectorized ``DataFrame.interpolate(axis=1)`` call —
    identical semantics (row-wise linear interpolation with
    ``limit_direction='both'`` then ``fillna(0)``), no per-pixel Python loop.

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array
        dates: List of date strings in YYYYMMDD format (length T)

    Returns:
        Dict mapping index name to smoothed (H, W, T) array

    Raises:
        ValueError: from savgol_filter if T < 5 (window_length).
    """
    from scipy.signal import savgol_filter

    smoothed = {}

    for idx_name, arr in timeseries_dict.items():
        # arr shape: (H, W, T); flatten pixels to rows for row-wise ops.
        H, W, T = arr.shape
        arr_2d = arr.reshape(-1, T)

        # 1. Treat exact zeros as missing observations.
        arr_2d = np.where(arr_2d == 0, np.nan, arr_2d)

        # 2. Row-wise linear interpolation (both directions), residual NaNs -> 0.
        filled = (
            pd.DataFrame(arr_2d)
            .interpolate(method='linear', axis=1, limit_direction='both')
            .fillna(0)
            .to_numpy()
        )

        # 3. Savitzky-Golay smoothing along time (window=5, polyorder=2).
        smooth_arr = savgol_filter(filled, window_length=5, polyorder=2, axis=1)

        # Restore the (H, W, T) layout.
        smoothed[idx_name] = smooth_arr.reshape(H, W, T)

    return smoothed
|
|
|
|
|
|
def extract_phenology_from_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi', 'ndre', 'evi']
) -> Dict[str, np.ndarray]:
    """Extract phenology metrics from time-series raster arrays.

    Replicates train.py extract_phenology():
    - Magnitude: max, min, mean, std, amplitude
    - AUC: trapezoid integral with dx=10
    - Timing: peak_timestep (argmax)
    - Slopes: max_slope_up, max_slope_down

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array (should be smoothed)
        dates: List of date strings
        indices: Which indices to process

    Returns:
        Dict mapping feature name to (H, W) array
    """
    from scipy.integrate import trapezoid

    features = {}

    for name in indices:
        if name not in timeseries_dict:
            continue

        cube = timeseries_dict[name]  # (H, W, T)

        # Magnitude metrics, reduced straight along the time axis.
        peak = cube.max(axis=2)
        trough = cube.min(axis=2)
        features[f'{name}_max'] = peak
        features[f'{name}_min'] = trough
        features[f'{name}_mean'] = cube.mean(axis=2)
        features[f'{name}_std'] = cube.std(axis=2)
        features[f'{name}_amplitude'] = peak - trough

        # Area under the curve, assuming 10-day spacing between timesteps.
        features[f'{name}_auc'] = trapezoid(cube, dx=10, axis=2)

        # Timing: timestep index of the seasonal peak.
        features[f'{name}_peak_timestep'] = np.argmax(cube, axis=2)

        # Rates of change between consecutive timesteps.
        deltas = np.diff(cube, axis=2)  # (H, W, T-1)
        features[f'{name}_max_slope_up'] = deltas.max(axis=2)
        features[f'{name}_max_slope_down'] = deltas.min(axis=2)

    return features
|
|
|
|
|
|
def add_harmonics_to_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi']
) -> Dict[str, np.ndarray]:
    """Add harmonic/fourier features from time-series raster arrays.

    Replicates train.py add_harmonics():
    - 1st order: sin(t), cos(t)
    - 2nd order: sin(2t), cos(2t)
    where t = 2*pi * time_step / n_times, i.e. the season is one full cycle.

    The feature value is the normalized dot product of the pixel's time
    series with each basis vector (divided by n_times).

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array (should be smoothed)
        dates: List of date strings
        indices: Which indices to process

    Returns:
        Dict mapping feature name to (H, W) array
    """
    features = {}
    n_times = len(dates)

    # One full cycle across the season: phase in [0, 2*pi).
    phase = 2 * np.pi * np.arange(n_times) / n_times

    # Harmonic basis vectors, keyed by the feature-name suffix they produce.
    basis = {
        'harmonic1_sin': np.sin(phase),
        'harmonic1_cos': np.cos(phase),
        'harmonic2_sin': np.sin(2 * phase),
        'harmonic2_cos': np.cos(2 * phase),
    }

    for name in indices:
        if name not in timeseries_dict:
            continue

        cube = timeseries_dict[name]  # (H, W, T)
        H, W, T = cube.shape
        flat = cube.reshape(-1, T)

        # Normalized projection onto each basis vector, back to (H, W).
        for suffix, vec in basis.items():
            features[f'{name}_{suffix}'] = (flat @ vec / n_times).reshape(H, W)

    return features
|
|
|
|
|
|
def add_seasonal_windows_and_interactions(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi', 'ndwi', 'ndre'],
    phenology_features: Dict[str, np.ndarray] = None
) -> Dict[str, np.ndarray]:
    """Add seasonal window statistics and index interactions.

    Replicates train.py add_interactions_and_windows():
    - Seasonal windows (Zimbabwe season: Oct-Jun):
        - Early: Oct-Dec (months 10, 11, 12)
        - Peak:  Jan-Mar (months 1, 2, 3)
        - Late:  Apr-Jun (months 4, 5, 6)
    - Interactions:
        - ndvi_ndre_peak_diff = ndvi_max - ndre_max
        - canopy_density_contrast = evi_mean / (ndvi_mean + 0.001)

    PERF: the per-window month masks depend only on ``dates``, so they are
    now computed once up front instead of being rebuilt inside the per-index
    loop (the original recomputed them len(indices) times).

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array
        dates: List of date strings in YYYYMMDD format (length T)
        indices: Which indices to process
        phenology_features: Dict of phenology features for interactions

    Returns:
        Dict mapping feature name to (H, W) array. Windows with no matching
        dates are skipped; interactions are added only when both inputs exist.
    """
    features = {}

    # Parse dates once to identify months.
    dt_dates = pd.to_datetime(dates, format='%Y%m%d')

    # Define seasonal windows (months).
    windows = {
        'early': [10, 11, 12],  # Oct-Dec
        'peak': [1, 2, 3],      # Jan-Mar
        'late': [4, 5, 6]       # Apr-Jun
    }

    # Hoisted: boolean time-masks per window (independent of the index).
    window_masks = {
        win_name: np.array([d.month in months for d in dt_dates])
        for win_name, months in windows.items()
    }

    for idx in indices:
        if idx not in timeseries_dict:
            continue

        arr = timeseries_dict[idx]  # (H, W, T)
        H, W, _ = arr.shape

        for win_name, month_mask in window_masks.items():
            # Skip windows with no acquisitions.
            if not np.any(month_mask):
                continue

            # Extract window slice and reduce along time.
            window_arr = arr[:, :, month_mask]  # (H, W, T_window)
            window_2d = window_arr.reshape(-1, window_arr.shape[2])
            features[f'{idx}_{win_name}_mean'] = np.mean(window_2d, axis=1).reshape(H, W)
            features[f'{idx}_{win_name}_max'] = np.max(window_2d, axis=1).reshape(H, W)

    # Add interactions (if phenology features available).
    if phenology_features is not None:
        # ndvi_ndre_peak_diff
        if 'ndvi_max' in phenology_features and 'ndre_max' in phenology_features:
            features['ndvi_ndre_peak_diff'] = (
                phenology_features['ndvi_max'] - phenology_features['ndre_max']
            )

        # canopy_density_contrast (0.001 guards against division by zero).
        if 'evi_mean' in phenology_features and 'ndvi_mean' in phenology_features:
            features['canopy_density_contrast'] = (
                phenology_features['evi_mean'] / (phenology_features['ndvi_mean'] + 0.001)
            )

    return features
|
|
|
|
|
|
# ==========================================
|
|
# Inference helpers
|
|
# ==========================================
|
|
|
|
# AOI tuple: (lon, lat, radius_m) — a circular area of interest centered at
# (lon, lat) in degrees, with radius_m given in meters.
AOI = Tuple[float, float, float]
|
|
|
|
|
|
def validate_aoi_zimbabwe(aoi: AOI, max_radius_m: float = 5000.0):
    """Basic AOI validation.

    - Ensures radius is in (0, max_radius_m]
    - Ensures the AOI center lies within rough Zimbabwe bounds.

    Raises ValueError on any violation; returns None when the AOI is valid.

    NOTE: For production, use a real Zimbabwe polygon and check circle
    intersects. You can load a simplified boundary GeoJSON and use shapely.
    """
    lon, lat, radius_m = aoi

    # Guard clauses on the radius.
    if radius_m <= 0:
        raise ValueError(f"radius_m must be in (0, {max_radius_m}]")
    if radius_m > max_radius_m:
        raise ValueError(f"radius_m must be in (0, {max_radius_m}]")

    # Rough bbox for Zimbabwe (good cheap pre-check):
    # Lon: 25.2 to 33.1, Lat: -22.5 to -15.6
    center_inside = (25.2 <= lon <= 33.1) and (-22.5 <= lat <= -15.6)
    if not center_inside:
        raise ValueError("AOI must be within Zimbabwe")
|
|
|
|
|
|
def clip_raster_to_aoi(
    src_path: str,
    aoi: AOI,
    dst_profile_like: Optional[dict] = None,
) -> Tuple[np.ndarray, dict]:
    """Clip a raster to the bounding box of an AOI circle.

    Template implementation: reads a window around the circle's bbox — the
    result is the enclosing SQUARE, not an exact circle. For an exact circle
    mask, add a mask step after reading.

    Args:
        src_path: Path to a raster openable by rasterio.
        aoi: (lon, lat, radius_m) circle; lon/lat in degrees.
        dst_profile_like: Optional target profile; when given, the clipped
            array is nearest-neighbor resampled onto that grid.

    Returns:
        (array, profile) for band 1 of the clipped window.

    NOTE(review): the bbox math assumes the source raster is in a geographic
    (degree-based) CRS — TODO confirm for all inputs.
    """
    lon, lat, radius_m = aoi

    with rasterio.open(src_path) as src:
        # Approx bbox from radius using rough degrees conversion
        # (1 degree ~= 111,320 m at the equator).
        # Production: use pyproj geodesic buffer.
        deg = radius_m / 111_320.0
        minx, maxx = lon - deg, lon + deg
        miny, maxy = lat - deg, lat + deg

        # Map the bbox to a pixel window on the source grid; round to whole
        # pixels so the read is aligned.
        window = rasterio.windows.from_bounds(minx, miny, maxx, maxy, transform=src.transform)
        window = window.round_offsets().round_lengths()

        # Only band 1 is read — multi-band sources lose extra bands here.
        arr = src.read(1, window=window)
        profile = src.profile.copy()

        # Update transform for the window
        profile.update(
            {
                "height": arr.shape[0],
                "width": arr.shape[1],
                "transform": rasterio.windows.transform(window, src.transform),
            }
        )

    # Optional: resample/align to dst_profile_like
    if dst_profile_like is not None:
        arr, profile = _resample_to_profile(arr, profile, dst_profile_like)

    return arr, profile
|
|
|
|
|
|
def _resample_to_profile(arr: np.ndarray, src_profile: dict, dst_profile: dict) -> Tuple[np.ndarray, dict]:
    """Nearest-neighbor resample ``arr`` onto the grid described by ``dst_profile``.

    Args:
        arr: 2D single-band array on the ``src_profile`` grid.
        src_profile: rasterio profile of ``arr`` (must contain "transform" and "crs").
        dst_profile: target profile (must contain "height", "width",
            "transform" and "crs").

    Returns:
        (resampled_array, profile) where profile is a copy of ``dst_profile``
        updated to a single band of the array's dtype.
    """
    # BUG FIX: `rasterio.warp` and `rasterio.io` are submodules that a bare
    # `import rasterio` does not guarantee to bind as attributes; import the
    # needed names explicitly so this helper cannot fail with AttributeError.
    from rasterio.io import MemoryFile
    from rasterio.warp import reproject

    dst_h = dst_profile["height"]
    dst_w = dst_profile["width"]

    dst_arr = np.empty((dst_h, dst_w), dtype=arr.dtype)

    # Round-trip through an in-memory dataset so reproject can read a band
    # with the source georeferencing attached.
    with MemoryFile() as mem:
        with mem.open(**src_profile) as src:
            src.write(arr, 1)
            reproject(
                source=rasterio.band(src, 1),
                destination=dst_arr,
                src_transform=src_profile["transform"],
                src_crs=src_profile["crs"],
                dst_transform=dst_profile["transform"],
                dst_crs=dst_profile["crs"],
                resampling=Resampling.nearest,
            )

    prof = dst_profile.copy()
    prof.update({"count": 1, "dtype": str(dst_arr.dtype)})
    return dst_arr, prof
|
|
|
|
|
|
def load_dw_baseline_window(cfg, year: int, season: str, aoi: AOI) -> Tuple[np.ndarray, dict]:
    """Loads the DW baseline seasonal COG from MinIO and clips to AOI.

    The cfg.storage implementation decides whether to stream or download
    locally.

    Expected naming convention:
        dw_{season}_{year}.tif OR DW_Zim_HighestConf_{year}_{year+1}.tif

    You can implement a mapping in cfg.dw_key_for(year, season).

    Returns:
        (array, profile) — band 1 of the clipped raster plus a single-band
        profile carrying the array's dtype.
    """
    local_path = cfg.storage.get_dw_local_path(year=year, season=season)
    arr, profile = clip_raster_to_aoi(local_path, aoi)

    # Normalize to a single-band profile with an explicit dtype.
    profile.update({"count": 1})
    profile.setdefault("dtype", str(arr.dtype))

    return arr, profile
|
|
|
|
|
|
# -------------------------
|
|
# DEA STAC feature stack
|
|
# -------------------------
|
|
|
|
def compute_indices_from_bands(
    red: np.ndarray,
    nir: np.ndarray,
    blue: np.ndarray = None,
    green: np.ndarray = None,
    swir1: np.ndarray = None,
    swir2: np.ndarray = None,
    rededge: np.ndarray = None,
) -> Dict[str, np.ndarray]:
    """Compute vegetation indices from band arrays.

    Indices computed (keys of the returned dict; optional ones only when the
    required bands are supplied):
    - NDVI = (NIR - Red) / (NIR + Red)
    - EVI  = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1)   [needs blue]
    - SAVI = ((NIR - Red) / (NIR + Red + L)) * (1 + L), L=0.5
    - NDRE = (NIR - RedEdge) / (NIR + RedEdge)                  [needs rededge or swir1]
    - CI_RE = (NIR / RedEdge) - 1                               [needs rededge or swir1]
    - NDWI = (Green - NIR) / (Green + NIR)                      [needs green]

    BUG FIX: the original guarded the red-edge branch with
    ``'rededge' in locals()`` but had no ``rededge`` variable, so the true
    red-edge path was unreachable dead code. ``rededge`` is now an explicit
    optional parameter (appended with a default — backward compatible); when
    absent, SWIR1 is used as a red-edge proxy, as before.

    Zero-valued denominators yield 0 in the corresponding index cells.

    Args:
        red: Red band (B4)
        nir: NIR band (B8)
        blue: Blue band (B2, optional)
        green: Green band (B3, optional)
        swir1: SWIR1 band (B11, optional; red-edge proxy fallback)
        swir2: SWIR2 band (B12, optional; currently unused, kept for interface
            compatibility with callers that pass it)
        rededge: Red-edge band (e.g. B5/B8A, optional)

    Returns:
        Dict mapping index name to array
    """
    indices = {}

    # Ensure float64 for precision
    nir = nir.astype(np.float64)
    red = red.astype(np.float64)

    # NDVI = (NIR - Red) / (NIR + Red)
    denominator = nir + red
    indices['ndvi'] = np.where(denominator != 0, (nir - red) / denominator, 0)

    # EVI = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1)
    if blue is not None:
        blue = blue.astype(np.float64)
        evi_denom = nir + 6*red - 7.5*blue + 1
        indices['evi'] = np.where(evi_denom != 0, 2.5 * (nir - red) / evi_denom, 0)

    # SAVI = ((NIR - Red) / (NIR + Red + L)) * (1 + L) where L=0.5
    L = 0.5
    savi_denom = nir + red + L
    indices['savi'] = np.where(savi_denom != 0, ((nir - red) / savi_denom) * (1 + L), 0)

    # NDRE / CI_RE: prefer a real red-edge band, fall back to SWIR1 proxy.
    if rededge is not None:
        rededge = rededge.astype(np.float64)
        ndre_denom = nir + rededge
        indices['ndre'] = np.where(ndre_denom != 0, (nir - rededge) / ndre_denom, 0)
        # CI_RE = (NIR / RedEdge) - 1
        indices['ci_re'] = np.where(rededge != 0, (nir / rededge) - 1, 0)
    elif swir1 is not None:
        # Fallback: use SWIR1 as proxy for red-edge if available
        swir1 = swir1.astype(np.float64)
        ndre_denom = nir + swir1
        indices['ndre'] = np.where(ndre_denom != 0, (nir - swir1) / ndre_denom, 0)
        indices['ci_re'] = np.where(swir1 != 0, (nir / swir1) - 1, 0)

    # NDWI = (Green - NIR) / (Green + NIR)
    if green is not None:
        green = green.astype(np.float64)
        ndwi_denom = green + nir
        indices['ndwi'] = np.where(ndwi_denom != 0, (green - nir) / ndwi_denom, 0)

    return indices
|
|
|
|
|
|
def build_feature_stack_from_dea(
    cfg,
    aoi: AOI,
    start_date: str,
    end_date: str,
    target_profile: dict,
) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
    """Query DEA STAC and compute a per-pixel feature cube.

    This function implements the FULL feature engineering pipeline matching train.py:
    1. Load Sentinel-2 L2A data from DEA STAC (cloud cover < 30%)
    2. Compute indices (ndvi, ndre, evi, savi, ci_re, ndwi)
    3. Apply Savitzky-Golay smoothing with 0-interpolation
    4. Extract phenology metrics (amplitude, AUC, peak, slope)
    5. Add harmonic/fourier features
    6. Add seasonal window statistics
    7. Add index interactions

    Any failure during STAC search / cube loading falls back to
    ``_build_placeholder_features`` so development can proceed offline.

    Args:
        cfg: Config object; ``cfg.dea_stac_url`` is used when present,
            otherwise the public DEA Africa explorer endpoint.
        aoi: (lon, lat, radius_m) circle; converted to a degree bbox.
        start_date / end_date: ISO-style date strings for the STAC search.
        target_profile: rasterio-style profile dict; "height"/"width" size
            the output arrays.

    Returns:
        feat_arr: (H, W, C) float32 feature cube (NaN/Inf replaced with 0)
        feat_profile: raster profile aligned to target_profile (single band, float32)
        feat_names: list[str], channel order of feat_arr
        aux_layers: dict for extra outputs (true_color, ndvi/evi/savi peaks)
    """
    # Import STAC dependencies (optional third-party — fail loudly if absent).
    try:
        import pystac_client
        import stackstac
    except ImportError:
        raise ImportError("pystac-client and stackstac are required for DEA STAC loading")

    # NOTE(review): savgol_filter/trapezoid are imported here but the actual
    # smoothing/AUC work is delegated to the module helpers — these two names
    # are unused in this function body.
    from scipy.signal import savgol_filter
    from scipy.integrate import trapezoid

    # Output grid size comes from the caller's target profile.
    H = target_profile["height"]
    W = target_profile["width"]

    # DEA STAC configuration (config override, else public endpoint).
    stac_url = cfg.dea_stac_url if hasattr(cfg, 'dea_stac_url') else "https://explorer.digitalearth.africa/stac"

    # AOI circle -> enclosing degree bbox (rough equatorial meters-per-degree).
    lon, lat, radius_m = aoi
    deg = radius_m / 111_320.0
    bbox = [lon - deg, lat - deg, lon + deg, lat + deg]

    # Query DEA STAC
    print(f"🔍 Querying DEA STAC: {stac_url}")
    print(f"   _bbox: {bbox}")
    print(f"   _dates: {start_date} to {end_date}")

    try:
        client = pystac_client.Client.open(stac_url)

        # Search for Sentinel-2 L2A
        search = client.search(
            collections=["s2_l2a"],
            bbox=bbox,
            datetime=f"{start_date}/{end_date}",
            query={
                "eo:cloud_cover": {"lt": 30},  # Cloud filter
            }
        )

        items = list(search.items())
        print(f"   Found {len(items)} Sentinel-2 scenes")

        if len(items) == 0:
            raise ValueError("No Sentinel-2 imagery available for the selected AOI and date range")

        # Load data using stackstac
        # Required bands: red, green, blue, nir, rededge (B5), swir1, swir2
        bands = ["red", "green", "blue", "nir", "nir08", "nir09", "swir16", "swir22"]

        cube = stackstac.stack(
            items,
            bounds=bbox,
            resolution=10,  # 10m (Sentinel-2 native)
            bands=bands,
            chunks={"x": 512, "y": 512},
            epsg=32736,  # UTM Zone 36S (Zimbabwe)
        )

        print(f"   Loaded cube shape: {cube.shape}")

    # Broad catch is deliberate: any STAC/network failure degrades to
    # placeholder features instead of crashing the pipeline.
    except Exception as e:
        print(f"   ⚠️ DEA STAC loading failed: {e}")
        print(f"   Returning placeholder features for development")
        return _build_placeholder_features(H, W, target_profile)

    # Extract acquisition dates from the cube's time coordinate.
    cube_dates = pd.to_datetime(cube.time.values)
    date_strings = [d.strftime('%Y%m%d') for d in cube_dates]

    # Materialize the lazy dask-backed cube. Axis order used below is
    # (T, C, H, W) — no transpose is performed despite what the original
    # comment claimed.
    band_data = cube.values  # (T, C, H, W)
    n_times = band_data.shape[0]

    # Map bands to names
    band_names = list(cube.band.values)

    # Extract individual bands.
    # NOTE(review): if band_name is missing this silently falls back to
    # channel index 0 (red) — verify this fallback is intended.
    def get_band_data(band_name: str) -> np.ndarray:
        idx = band_names.index(band_name) if band_name in band_names else 0
        # Shape: (T, H, W)
        return band_data[:, idx, :, :]

    # Build timeseries dict for each index
    # Compute indices for each timestep
    # NOTE(review): indices_list is never used below.
    indices_list = []

    # Get available bands
    available_bands = {}
    for bn in ['red', 'green', 'blue', 'nir', 'nir08', 'nir09', 'swir16', 'swir22']:
        if bn in band_names:
            available_bands[bn] = get_band_data(bn)

    # Compute indices for each timestep
    timeseries_dict = {}

    for t in range(n_times):
        # Get bands for this timestep
        bands_t = {k: v[t] for k, v in available_bands.items()}

        # Compute indices
        red = bands_t.get('red', None)
        nir = bands_t.get('nir', None)
        green = bands_t.get('green', None)
        blue = bands_t.get('blue', None)
        nir08 = bands_t.get('nir08', None)  # B8A (red-edge)
        swir16 = bands_t.get('swir16', None)  # B11
        swir22 = bands_t.get('swir22', None)  # B12

        # NDVI needs both red and NIR; skip timesteps missing either.
        if red is None or nir is None:
            continue

        # Compute indices at this timestep
        # Use nir08 as red-edge if available, else swir16 as proxy
        rededge = nir08 if nir08 is not None else (swir16 if swir16 is not None else None)

        indices_t = compute_indices_from_bands(
            red=red,
            nir=nir,
            blue=blue,
            green=green,
            swir1=swir16,
            swir2=swir22
        )

        # Add NDRE and CI_RE if we have red-edge (overrides any SWIR-proxy
        # values computed inside compute_indices_from_bands).
        if rededge is not None:
            denom = nir + rededge
            indices_t['ndre'] = np.where(denom != 0, (nir - rededge) / denom, 0)
            indices_t['ci_re'] = np.where(rededge != 0, (nir / rededge) - 1, 0)

        # Stack into timeseries, lazily allocating one (H, W, T) cube per index.
        # NOTE(review): idx_arr comes off the stackstac grid — this assumes it
        # matches the (H, W) taken from target_profile; TODO confirm alignment.
        for idx_name, idx_arr in indices_t.items():
            if idx_name not in timeseries_dict:
                timeseries_dict[idx_name] = np.zeros((H, W, n_times), dtype=np.float32)
            timeseries_dict[idx_name][:, :, t] = idx_arr.astype(np.float32)

    # Ensure at least one index exists
    if not timeseries_dict:
        print("   ⚠️ No indices computed, returning placeholders")
        return _build_placeholder_features(H, W, target_profile)

    # ========================================
    # Apply Feature Engineering Pipeline
    # (matching train.py exactly)
    # ========================================

    print("   🔧 Applying feature engineering pipeline...")

    # 1. Apply smoothing (Savitzky-Golay)
    print("      - Smoothing (Savitzky-Golay window=5, polyorder=2)")
    smoothed_dict = apply_smoothing_to_rasters(timeseries_dict, date_strings)

    # 2. Extract phenology
    print("      - Phenology metrics (amplitude, AUC, peak, slope)")
    phenology_features = extract_phenology_from_rasters(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndre', 'evi', 'savi']
    )

    # 3. Add harmonics
    print("      - Harmonic features (1st/2nd order sin/cos)")
    harmonic_features = add_harmonics_to_rasters(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndre', 'evi']
    )

    # 4. Seasonal windows + interactions
    print("      - Seasonal windows (Early/Peak/Late) + interactions")
    window_features = add_seasonal_windows_and_interactions(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndwi', 'ndre'],
        phenology_features=phenology_features
    )

    # ========================================
    # Combine all features
    # ========================================

    # Collect all features in order
    all_features = {}
    all_features.update(phenology_features)
    all_features.update(harmonic_features)
    all_features.update(window_features)

    # Get feature names in consistent order
    # Order: phenology (ndvi) -> phenology (ndre) -> phenology (evi) -> phenology (savi)
    # -> harmonics -> windows -> interactions
    feat_names = []

    # Phenology order: ndvi, ndre, evi, savi
    for idx in ['ndvi', 'ndre', 'evi', 'savi']:
        for suffix in ['_max', '_min', '_mean', '_std', '_amplitude', '_auc', '_peak_timestep', '_max_slope_up', '_max_slope_down']:
            key = f'{idx}{suffix}'
            if key in all_features:
                feat_names.append(key)

    # Harmonics order: ndvi, ndre, evi
    for idx in ['ndvi', 'ndre', 'evi']:
        for suffix in ['_harmonic1_sin', '_harmonic1_cos', '_harmonic2_sin', '_harmonic2_cos']:
            key = f'{idx}{suffix}'
            if key in all_features:
                feat_names.append(key)

    # Window features: ndvi, ndwi, ndre (early, peak, late)
    for idx in ['ndvi', 'ndwi', 'ndre']:
        for win in ['early', 'peak', 'late']:
            for stat in ['_mean', '_max']:
                key = f'{idx}_{win}{stat}'
                if key in all_features:
                    feat_names.append(key)

    # Interactions
    if 'ndvi_ndre_peak_diff' in all_features:
        feat_names.append('ndvi_ndre_peak_diff')
    if 'canopy_density_contrast' in all_features:
        feat_names.append('canopy_density_contrast')

    print(f"   Total features: {len(feat_names)}")

    # Build feature array: one float32 channel per selected feature.
    feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32)
    for i, feat_name in enumerate(feat_names):
        if feat_name in all_features:
            feat_arr[:, :, i] = all_features[feat_name]

    # Handle NaN/Inf so downstream models never see non-finite values.
    feat_arr = np.nan_to_num(feat_arr, nan=0.0, posinf=0.0, neginf=0.0)

    # ========================================
    # Build aux layers for visualization
    # ========================================

    aux_layers = {}

    # True color composite (per-pixel temporal median of R/G/B).
    if 'red' in available_bands and 'green' in available_bands and 'blue' in available_bands:
        # Get median of clear observations
        red_arr = available_bands['red']  # (T, H, W)
        green_arr = available_bands['green']
        blue_arr = available_bands['blue']

        # Simple median composite
        tc = np.stack([
            np.median(red_arr, axis=0),
            np.median(green_arr, axis=0),
            np.median(blue_arr, axis=0),
        ], axis=-1)
        aux_layers['true_color'] = tc.astype(np.uint16)

    # Index peaks for visualization
    for idx in ['ndvi', 'evi', 'savi']:
        if f'{idx}_max' in all_features:
            aux_layers[f'{idx}_peak'] = all_features[f'{idx}_max']

    feat_profile = target_profile.copy()
    feat_profile.update({"count": 1, "dtype": "float32"})

    return feat_arr, feat_profile, feat_names, aux_layers
|
|
|
|
|
|
def _build_placeholder_features(H: int, W: int, target_profile: dict) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
|
|
"""Build placeholder features when DEA STAC is unavailable.
|
|
|
|
This allows the pipeline to run during development without API access.
|
|
"""
|
|
# Minimal feature set matching training expected features
|
|
feat_names = ["ndvi_peak", "evi_peak", "savi_peak"]
|
|
feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32)
|
|
|
|
aux_layers = {
|
|
"true_color": np.zeros((H, W, 3), dtype=np.uint16),
|
|
"ndvi_peak": np.zeros((H, W), dtype=np.float32),
|
|
"evi_peak": np.zeros((H, W), dtype=np.float32),
|
|
"savi_peak": np.zeros((H, W), dtype=np.float32),
|
|
}
|
|
|
|
feat_profile = target_profile.copy()
|
|
feat_profile.update({"count": 1, "dtype": "float32"})
|
|
|
|
return feat_arr, feat_profile, feat_names, aux_layers
|
|
|
|
|
|
# -------------------------
|
|
# Neighborhood smoothing
|
|
# -------------------------
|
|
|
|
def majority_filter(arr: np.ndarray, k: int = 3) -> np.ndarray:
    """Majority (modal) filter for 2D class-label arrays.

    Each output cell holds the most frequent value in its k x k neighborhood;
    edges are handled by replicate padding. Ties break toward the smallest
    value (bincount/unique argmax semantics).

    Args:
        arr: 2D label array. Integer dtypes take a fast ``np.bincount`` path
            (now safe for negative labels such as a -1 nodata code); other
            dtypes (e.g. strings) use a slower ``np.unique`` path.
        k: Window size — must be odd and >= 3 (3, 5, 7, ...).

    Returns:
        Filtered array of the same shape and dtype.

    Raises:
        ValueError: if k is even or < 3.

    NOTE: This is a simple CPU implementation. For speed:
    - convert labels to ints
    - use scipy.ndimage or numba
    - or apply with rasterio/gdal focal statistics
    """
    if k % 2 == 0 or k < 3:
        raise ValueError("k must be odd and >= 3")

    pad = k // 2
    H, W = arr.shape
    # Replicate the border so every window is full-size.
    padded = np.pad(arr, ((pad, pad), (pad, pad)), mode="edge")

    out = arr.copy()

    # If numeric, use bincount fast path.
    if np.issubdtype(arr.dtype, np.integer):
        if arr.size == 0:
            return out
        # BUG FIX: np.bincount rejects negative values; shift labels by the
        # global minimum so nodata codes like -1 no longer raise ValueError.
        minv = int(arr.min())
        n_bins = int(arr.max()) - minv + 1
        for y in range(H):
            for x in range(W):
                win = padded[y : y + k, x : x + k].ravel()
                counts = np.bincount(win - minv, minlength=n_bins)
                out[y, x] = counts.argmax() + minv
        return out

    # String/obj path
    for y in range(H):
        for x in range(W):
            win = padded[y : y + k, x : x + k].ravel()
            vals, counts = np.unique(win, return_counts=True)
            out[y, x] = vals[counts.argmax()]

    return out
|