# geocrop-platform/apps/worker/features.py
"""Feature engineering + geospatial helpers for GeoCrop.
This module is shared by training (feature selection + scaling helpers)
AND inference (DEA STAC fetch + raster alignment + smoothing).
IMPORTANT: This implementation exactly replicates train.py feature engineering:
- Savitzky-Golay smoothing (window=5, polyorder=2) with 0-interpolation
- Phenology metrics (amplitude, AUC, peak_timestep, max_slope)
- Harmonic/Fourier features (1st and 2nd order sin/cos)
- Seasonal window statistics (Early: Oct-Dec, Peak: Jan-Mar, Late: Apr-Jun)
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, Iterable, List, Optional, Tuple
import numpy as np
import pandas as pd
# Raster / geo
import rasterio
from rasterio.enums import Resampling
# ==========================================
# Training helpers
# ==========================================
def drop_junk_columns(df: pd.DataFrame, junk_cols: List[str]) -> pd.DataFrame:
    """Remove leakage-prone junk/spatial columns from *df*.

    Mirrors train.py's junk_cols list: ['.geo', 'system:index', 'latitude',
    'longitude', 'lat', 'lon', 'ID', 'parent_id', 'batch_id', 'is_syn'].
    Names absent from *df* are silently ignored; a new frame is returned.
    """
    present = set(df.columns)
    return df.drop(columns=[name for name in junk_cols if name in present])
def scout_feature_selection(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    n_estimators: int = 100,
    random_state: int = 42,
) -> List[str]:
    """Quick LightGBM 'scout' pass: keep features with non-zero importance.

    Fits a throwaway LGBMClassifier and returns the feature names whose
    importance is strictly positive, ordered by descending importance.
    Falls back to every column if nothing survives the cut.
    """
    import lightgbm as lgb
    scout = lgb.LGBMClassifier(
        n_estimators=n_estimators, random_state=random_state, verbose=-1
    )
    scout.fit(X_train, y_train)
    ranking = pd.DataFrame(
        {"Feature": X_train.columns, "Importance": scout.feature_importances_}
    ).sort_values("Importance", ascending=False)
    kept = ranking.loc[ranking["Importance"] > 0, "Feature"].tolist()
    # Never return an empty feature set -- better than breaking training.
    return kept if kept else list(X_train.columns)
def scale_numeric_features(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
):
    """Standard-scale only the numeric columns of both frames.

    Fits a StandardScaler on X_train's numeric columns (matches train.py),
    applies it to train and test copies, and returns
    (X_train_scaled, X_test_scaled, scaler). Non-numeric columns pass
    through untouched.
    """
    from sklearn.preprocessing import StandardScaler
    numeric = X_train.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler().fit(X_train[numeric])
    train_out = X_train.copy()
    test_out = X_test.copy()
    train_out[numeric] = scaler.transform(X_train[numeric])
    test_out[numeric] = scaler.transform(X_test[numeric])
    return train_out, test_out, scaler
# ==========================================
# INFERENCE-ONLY FEATURE ENGINEERING
# These functions replicate train.py for raster-based inference
# ==========================================
def apply_smoothing_to_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str]
) -> Dict[str, np.ndarray]:
    """Apply Savitzky-Golay smoothing to time-series raster arrays.

    Replicates train.py apply_smoothing():
      1. Replace 0 with NaN (0 is treated as nodata/masked)
      2. Linear interpolate across the time axis, fillna(0)
      3. Savitzky-Golay: window_length=5, polyorder=2

    Improvements over the previous version:
      * the per-pixel Python loop (one pandas Series per pixel) is replaced
        by a single vectorized DataFrame.interpolate(axis=1) call;
      * the smoothing window shrinks gracefully for very short series
        (savgol_filter requires an odd window <= T and > polyorder), where
        the old code raised ValueError.

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array
        dates: List of date strings in YYYYMMDD format (len == T)

    Returns:
        Dict mapping index name to smoothed (H, W, T) float array
    """
    from scipy.signal import savgol_filter
    smoothed: Dict[str, np.ndarray] = {}
    for idx_name, arr in timeseries_dict.items():
        # arr shape: (H, W, T) -> flatten space so each row is one pixel.
        H, W, T = arr.shape
        arr_2d = arr.reshape(-1, T).astype(np.float64)
        # 1. Treat exact zeros as missing observations.
        arr_2d = np.where(arr_2d == 0, np.nan, arr_2d)
        # 2. Vectorized linear interpolation along time (axis=1); identical
        #    per-row semantics to Series.interpolate(limit_direction='both').
        interp_arr = (
            pd.DataFrame(arr_2d)
            .interpolate(method='linear', axis=1, limit_direction='both')
            .fillna(0.0)
            .to_numpy()
        )
        # 3. Savitzky-Golay smoothing: window=5, polyorder=2, clamped so the
        #    filter stays valid for short series (odd window, > polyorder).
        window = 5 if T >= 5 else (T if T % 2 == 1 else max(T - 1, 1))
        poly = min(2, window - 1)
        smooth_arr = savgol_filter(interp_arr, window_length=window, polyorder=poly, axis=1)
        # Reshape back to (H, W, T)
        smoothed[idx_name] = smooth_arr.reshape(H, W, T)
    return smoothed
def extract_phenology_from_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi', 'ndre', 'evi']
) -> Dict[str, np.ndarray]:
    """Derive per-pixel phenology metrics from (smoothed) time-series cubes.

    Mirrors train.py extract_phenology():
      * magnitude: max / min / mean / std / amplitude
      * auc: trapezoidal integral with dx=10 (10-day compositing interval)
      * timing: peak_timestep (argmax along time)
      * slopes: max_slope_up / max_slope_down from first differences

    Args:
        timeseries_dict: index name -> (H, W, T) array (should be smoothed)
        dates: list of date strings (kept for API symmetry)
        indices: which indices to process

    Returns:
        feature name -> (H, W) array
    """
    from scipy.integrate import trapezoid
    out: Dict[str, np.ndarray] = {}
    for name in indices:
        if name not in timeseries_dict:
            continue
        cube = timeseries_dict[name]  # (H, W, T)
        H, W, T = cube.shape
        flat = cube.reshape(-1, T)  # one row per pixel
        # --- magnitude metrics ---
        vmax = flat.max(axis=1)
        vmin = flat.min(axis=1)
        out[f'{name}_max'] = vmax.reshape(H, W)
        out[f'{name}_min'] = vmin.reshape(H, W)
        out[f'{name}_mean'] = flat.mean(axis=1).reshape(H, W)
        out[f'{name}_std'] = flat.std(axis=1).reshape(H, W)
        out[f'{name}_amplitude'] = (vmax - vmin).reshape(H, W)
        # --- area under curve (dx=10, 10-day intervals) ---
        out[f'{name}_auc'] = trapezoid(flat, dx=10, axis=1).reshape(H, W)
        # --- timing ---
        out[f'{name}_peak_timestep'] = flat.argmax(axis=1).reshape(H, W)
        # --- rates of change ---
        deltas = np.diff(flat, axis=1)  # (H*W, T-1)
        out[f'{name}_max_slope_up'] = deltas.max(axis=1).reshape(H, W)
        out[f'{name}_max_slope_down'] = deltas.min(axis=1).reshape(H, W)
    return out
def add_harmonics_to_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi']
) -> Dict[str, np.ndarray]:
    """Project each pixel's time series onto 1st/2nd-order Fourier bases.

    Mirrors train.py add_harmonics(): with t = 2*pi*step/n_times, the four
    coefficients per index are the n_times-normalised dot products of the
    series against sin(t), cos(t), sin(2t), cos(2t).

    Args:
        timeseries_dict: index name -> (H, W, T) array (should be smoothed)
        dates: list of date strings (only the count is used)
        indices: which indices to process

    Returns:
        feature name -> (H, W) array
    """
    n_times = len(dates)
    # One full 0..2*pi cycle over the season.
    phase = 2.0 * np.pi * np.arange(n_times) / n_times
    # Basis vectors keyed by the output feature suffix.
    basis = {
        'harmonic1_sin': np.sin(phase),
        'harmonic1_cos': np.cos(phase),
        'harmonic2_sin': np.sin(2 * phase),
        'harmonic2_cos': np.cos(2 * phase),
    }
    out: Dict[str, np.ndarray] = {}
    for name in indices:
        if name not in timeseries_dict:
            continue
        cube = timeseries_dict[name]  # (H, W, T)
        H, W, T = cube.shape
        flat = cube.reshape(-1, T)
        for suffix, vec in basis.items():
            # Normalized dot product, reshaped back to the raster grid.
            out[f'{name}_{suffix}'] = (flat @ vec / n_times).reshape(H, W)
    return out
def add_seasonal_windows_and_interactions(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi', 'ndwi', 'ndre'],
    phenology_features: Dict[str, np.ndarray] = None
) -> Dict[str, np.ndarray]:
    """Seasonal-window statistics plus cross-index interaction features.

    Mirrors train.py add_interactions_and_windows() for the Zimbabwe
    growing season (Oct-Jun):
      * windows: early (Oct-Dec), peak (Jan-Mar), late (Apr-Jun)
        -> per-window mean and max
      * interactions (when phenology_features is given):
        - ndvi_ndre_peak_diff = ndvi_max - ndre_max
        - canopy_density_contrast = evi_mean / (ndvi_mean + 0.001)

    Args:
        timeseries_dict: index name -> (H, W, T) array
        dates: YYYYMMDD strings, one per timestep
        indices: which indices get window statistics
        phenology_features: phenology dict used for the interaction terms

    Returns:
        feature name -> (H, W) array
    """
    out: Dict[str, np.ndarray] = {}
    months_per_step = pd.to_datetime(dates, format='%Y%m%d').month
    season_months = {
        'early': {10, 11, 12},  # Oct-Dec
        'peak': {1, 2, 3},      # Jan-Mar
        'late': {4, 5, 6},      # Apr-Jun
    }
    for name in indices:
        if name not in timeseries_dict:
            continue
        cube = timeseries_dict[name]  # (H, W, T)
        H, W, T = cube.shape
        for season, wanted in season_months.items():
            mask = np.array([m in wanted for m in months_per_step])
            if not mask.any():
                continue  # no acquisitions fall inside this window
            flat = cube[:, :, mask].reshape(H * W, -1)
            out[f'{name}_{season}_mean'] = flat.mean(axis=1).reshape(H, W)
            out[f'{name}_{season}_max'] = flat.max(axis=1).reshape(H, W)
    # Interactions need the phenology summaries computed earlier.
    if phenology_features is not None:
        pf = phenology_features
        if 'ndvi_max' in pf and 'ndre_max' in pf:
            out['ndvi_ndre_peak_diff'] = pf['ndvi_max'] - pf['ndre_max']
        if 'evi_mean' in pf and 'ndvi_mean' in pf:
            out['canopy_density_contrast'] = pf['evi_mean'] / (pf['ndvi_mean'] + 0.001)
    return out
# ==========================================
# Inference helpers
# ==========================================
# Type alias for an area of interest: (lon_degrees, lat_degrees, radius_metres).
AOI = Tuple[float, float, float]
def validate_aoi_zimbabwe(aoi: AOI, max_radius_m: float = 5000.0):
    """Cheap sanity checks on an AOI (lon, lat, radius_m) tuple.

    Raises ValueError when the radius is non-positive or exceeds
    *max_radius_m*, or when the centre falls outside a rough Zimbabwe
    bounding box.

    NOTE: for production, check the buffered circle against a real
    Zimbabwe polygon (e.g. a simplified boundary GeoJSON via shapely).
    """
    lon, lat, radius_m = aoi
    if radius_m <= 0 or radius_m > max_radius_m:
        raise ValueError(f"radius_m must be in (0, {max_radius_m}]")
    # Coarse Zimbabwe bbox pre-check: lon 25.2..33.1, lat -22.5..-15.6
    lon_ok = 25.2 <= lon <= 33.1
    lat_ok = -22.5 <= lat <= -15.6
    if not (lon_ok and lat_ok):
        raise ValueError("AOI must be within Zimbabwe")
def clip_raster_to_aoi(
    src_path: str,
    aoi: AOI,
    dst_profile_like: Optional[dict] = None,
) -> Tuple[np.ndarray, dict]:
    """Clip a raster to the AOI circle's bounding box.

    Template implementation: reads a window around the circle's bbox only.
    For an exact circular clip, add a mask step after reading.

    Args:
        src_path: path to the source raster (band 1 is read).
        aoi: (lon, lat, radius_m) tuple.
        dst_profile_like: optional profile to resample/align the clip to.

    Returns:
        (array, profile) for the clipped window.
    """
    # BUGFIX: import the windows submodule explicitly -- rasterio does not
    # guarantee `rasterio.windows` is reachable from the bare top-level import.
    from rasterio.windows import from_bounds
    from rasterio.windows import transform as window_transform

    lon, lat, radius_m = aoi
    with rasterio.open(src_path) as src:
        # Approx bbox from radius using the rough 1 deg ~ 111.32 km rule.
        # Production: use a pyproj geodesic buffer.
        deg = radius_m / 111_320.0
        minx, maxx = lon - deg, lon + deg
        miny, maxy = lat - deg, lat + deg
        window = from_bounds(minx, miny, maxx, maxy, transform=src.transform)
        window = window.round_offsets().round_lengths()
        arr = src.read(1, window=window)
        profile = src.profile.copy()
        # The windowed read has its own grid; update shape + transform.
        profile.update(
            {
                "height": arr.shape[0],
                "width": arr.shape[1],
                "transform": window_transform(window, src.transform),
            }
        )
    # Optional: resample/align to dst_profile_like (src no longer needed).
    if dst_profile_like is not None:
        arr, profile = _resample_to_profile(arr, profile, dst_profile_like)
    return arr, profile
def _resample_to_profile(arr: np.ndarray, src_profile: dict, dst_profile: dict) -> Tuple[np.ndarray, dict]:
    """Nearest-neighbour reproject/resample *arr* onto the dst grid.

    Args:
        arr: single-band (H, W) source array.
        src_profile: rasterio profile describing *arr* (transform, crs, ...).
        dst_profile: target profile (height, width, transform, crs).

    Returns:
        (resampled array, profile copied from dst with count/dtype updated).
    """
    # BUGFIX: `rasterio.warp` and `rasterio.io` are NOT attributes of the bare
    # `rasterio` package -- accessing them without an explicit import raises
    # AttributeError. Import the needed names explicitly.
    from rasterio.io import MemoryFile
    from rasterio.warp import reproject

    dst_h = dst_profile["height"]
    dst_w = dst_profile["width"]
    dst_arr = np.empty((dst_h, dst_w), dtype=arr.dtype)
    # Round-trip through an in-memory dataset so reproject can read band 1.
    with MemoryFile() as mem:
        with mem.open(**src_profile) as src:
            src.write(arr, 1)
            reproject(
                source=rasterio.band(src, 1),
                destination=dst_arr,
                src_transform=src_profile["transform"],
                src_crs=src_profile["crs"],
                dst_transform=dst_profile["transform"],
                dst_crs=dst_profile["crs"],
                resampling=Resampling.nearest,
            )
    prof = dst_profile.copy()
    prof.update({"count": 1, "dtype": str(dst_arr.dtype)})
    return dst_arr, prof
def load_dw_baseline_window(cfg, year: int, season: str, aoi: AOI) -> Tuple[np.ndarray, dict]:
    """Load the DW baseline seasonal COG from MinIO and clip it to the AOI.

    cfg.storage decides whether the COG is streamed or downloaded locally.
    Expected naming convention:
        dw_{season}_{year}.tif  OR  DW_Zim_HighestConf_{year}_{year+1}.tif
    (a mapping can live in cfg.dw_key_for(year, season)).
    """
    local_path = cfg.storage.get_dw_local_path(year=year, season=season)
    arr, profile = clip_raster_to_aoi(local_path, aoi)
    # Normalise to a single-band profile with an explicit dtype.
    profile.update({"count": 1})
    profile.setdefault("dtype", str(arr.dtype))
    return arr, profile
# -------------------------
# DEA STAC feature stack
# -------------------------
def compute_indices_from_bands(
    red: np.ndarray,
    nir: np.ndarray,
    blue: np.ndarray = None,
    green: np.ndarray = None,
    swir1: np.ndarray = None,
    swir2: np.ndarray = None,
    rededge: np.ndarray = None
) -> Dict[str, np.ndarray]:
    """Compute vegetation indices from band arrays.

    Indices computed (when the needed bands are present):
      - NDVI  = (NIR - Red) / (NIR + Red)
      - EVI   = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1)
      - SAVI  = ((NIR - Red) / (NIR + Red + L)) * (1 + L), L = 0.5
      - NDRE  = (NIR - RedEdge) / (NIR + RedEdge)
      - CI_RE = (NIR / RedEdge) - 1
      - NDWI  = (Green - NIR) / (Green + NIR)
    All ratios return 0 where the denominator is 0.

    Args:
        red: Red band (B4)
        nir: NIR band (B8)
        blue: Blue band (B2, optional; enables EVI)
        green: Green band (B3, optional; enables NDWI)
        swir1: SWIR1 band (B11, optional; red-edge proxy for NDRE/CI_RE)
        swir2: SWIR2 band (B12, optional; currently unused)
        rededge: Red-edge band (e.g. B5/B8A, optional). BUGFIX: the old code
            tested `'rededge' in locals()` but never defined the name, so the
            true red-edge branch was unreachable dead code; it is now a real
            (backward-compatible, defaulted) parameter.

    Returns:
        Dict mapping index name to float64 array.
    """
    def _ratio(num: np.ndarray, den: np.ndarray) -> np.ndarray:
        # num/den with 0 where den == 0; `where=` avoids the divide-by-zero
        # RuntimeWarnings that np.where-based guards still emit.
        out = np.zeros_like(den, dtype=np.float64)
        np.divide(num, den, out=out, where=den != 0)
        return out

    indices: Dict[str, np.ndarray] = {}
    # Ensure float64 for precision
    nir = nir.astype(np.float64)
    red = red.astype(np.float64)
    # NDVI = (NIR - Red) / (NIR + Red)
    indices['ndvi'] = _ratio(nir - red, nir + red)
    # EVI = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1)
    if blue is not None:
        blue = blue.astype(np.float64)
        indices['evi'] = _ratio(2.5 * (nir - red), nir + 6 * red - 7.5 * blue + 1)
    # SAVI with soil-brightness factor L=0.5; the (1 + L) rescale is folded
    # into the numerator (identical result).
    L = 0.5
    indices['savi'] = _ratio((nir - red) * (1 + L), nir + red + L)
    # NDRE / CI_RE: prefer a true red-edge band, else fall back to SWIR1
    # as a proxy (matches the old fallback behavior).
    re_band = rededge if rededge is not None else swir1
    if re_band is not None:
        re_band = re_band.astype(np.float64)
        indices['ndre'] = _ratio(nir - re_band, nir + re_band)
        indices['ci_re'] = np.where(re_band != 0, _ratio(nir, re_band) - 1, 0)
    # NDWI = (Green - NIR) / (Green + NIR)
    if green is not None:
        green = green.astype(np.float64)
        indices['ndwi'] = _ratio(green - nir, green + nir)
    return indices
def build_feature_stack_from_dea(
    cfg,
    aoi: AOI,
    start_date: str,
    end_date: str,
    target_profile: dict,
) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
    """Query DEA STAC and compute a per-pixel feature cube.

    This function implements the FULL feature engineering pipeline matching train.py:
    1. Load Sentinel-2 data from DEA STAC
    2. Compute indices (ndvi, ndre, evi, savi, ci_re, ndwi)
    3. Apply Savitzky-Golay smoothing with 0-interpolation
    4. Extract phenology metrics (amplitude, AUC, peak, slope)
    5. Add harmonic/fourier features
    6. Add seasonal window statistics
    7. Add index interactions

    Any failure while querying/loading STAC data (including "no scenes found")
    degrades to `_build_placeholder_features` so development can proceed offline.

    Args:
        cfg: worker config; only the optional `cfg.dea_stac_url` attribute is read.
        aoi: (lon, lat, radius_m) tuple.
        start_date: ISO date string, inclusive lower bound of the STAC search.
        end_date: ISO date string, inclusive upper bound of the STAC search.
        target_profile: raster profile supplying the output "height"/"width".

    Returns:
        feat_arr: (H, W, C) float32 feature cube
        feat_profile: raster profile aligned to target_profile
        feat_names: list[str], one name per channel of feat_arr
        aux_layers: dict for extra outputs (true_color, ndvi, evi, savi peaks)

    Raises:
        ImportError: when pystac-client / stackstac are not installed.
    """
    # Import STAC dependencies lazily so the module imports without them.
    try:
        import pystac_client
        import stackstac
    except ImportError:
        raise ImportError("pystac-client and stackstac are required for DEA STAC loading")
    # NOTE(review): these two imports are unused in this function -- the
    # smoothing/phenology helpers import them internally. Kept as-is.
    from scipy.signal import savgol_filter
    from scipy.integrate import trapezoid
    H = target_profile["height"]
    W = target_profile["width"]
    # DEA STAC configuration (cfg may omit the URL; fall back to the public endpoint).
    stac_url = cfg.dea_stac_url if hasattr(cfg, 'dea_stac_url') else "https://explorer.digitalearth.africa/stac"
    # AOI to bbox using the rough 1 deg ~ 111.32 km conversion.
    lon, lat, radius_m = aoi
    deg = radius_m / 111_320.0
    bbox = [lon - deg, lat - deg, lon + deg, lat + deg]
    # Query DEA STAC
    print(f"🔍 Querying DEA STAC: {stac_url}")
    print(f" _bbox: {bbox}")
    print(f" _dates: {start_date} to {end_date}")
    try:
        client = pystac_client.Client.open(stac_url)
        # Search for Sentinel-2 L2A
        search = client.search(
            collections=["s2_l2a"],
            bbox=bbox,
            datetime=f"{start_date}/{end_date}",
            query={
                "eo:cloud_cover": {"lt": 30},  # Cloud filter
            }
        )
        items = list(search.items())
        print(f" Found {len(items)} Sentinel-2 scenes")
        # NOTE(review): this raise is caught by the except below, so "no
        # imagery" also falls through to placeholder features.
        if len(items) == 0:
            raise ValueError("No Sentinel-2 imagery available for the selected AOI and date range")
        # Load data using stackstac
        # Required bands: red, green, blue, nir, rededge (B5), swir1, swir2
        bands = ["red", "green", "blue", "nir", "nir08", "nir09", "swir16", "swir22"]
        cube = stackstac.stack(
            items,
            bounds=bbox,
            resolution=10,  # 10m (Sentinel-2 native)
            bands=bands,
            chunks={"x": 512, "y": 512},
            epsg=32736,  # UTM Zone 36S (Zimbabwe)
        )
        print(f" Loaded cube shape: {cube.shape}")
    except Exception as e:
        # Broad catch by design: any STAC/loading failure degrades to
        # placeholder features so the pipeline still runs offline.
        print(f" ⚠️ DEA STAC loading failed: {e}")
        print(f" Returning placeholder features for development")
        return _build_placeholder_features(H, W, target_profile)
    # Extract acquisition dates from the cube's time coordinate.
    cube_dates = pd.to_datetime(cube.time.values)
    date_strings = [d.strftime('%Y%m%d') for d in cube_dates]
    # Get band data -- stackstac returns (T, C, H, W) and is indexed as such
    # below (no transpose happens). This materialises the dask cube in memory.
    band_data = cube.values  # (T, C, H, W)
    n_times = band_data.shape[0]
    # Map bands to names
    band_names = list(cube.band.values)
    # Extract individual bands
    def get_band_data(band_name):
        # Falls back to channel 0 when the name is missing; callers below
        # guard with `if bn in band_names`, so the fallback is never hit.
        idx = band_names.index(band_name) if band_name in band_names else 0
        # Shape: (T, H, W)
        return band_data[:, idx, :, :]
    # Build timeseries dict for each index
    # Compute indices for each timestep
    indices_list = []  # NOTE(review): unused; kept to preserve behavior
    # Get available bands
    available_bands = {}
    for bn in ['red', 'green', 'blue', 'nir', 'nir08', 'nir09', 'swir16', 'swir22']:
        if bn in band_names:
            available_bands[bn] = get_band_data(bn)
    # Compute indices for each timestep
    timeseries_dict = {}
    for t in range(n_times):
        # Get the (H, W) slice of every available band for this timestep.
        bands_t = {k: v[t] for k, v in available_bands.items()}
        # Compute indices
        red = bands_t.get('red', None)
        nir = bands_t.get('nir', None)
        green = bands_t.get('green', None)
        blue = bands_t.get('blue', None)
        nir08 = bands_t.get('nir08', None)  # B8A (red-edge)
        swir16 = bands_t.get('swir16', None)  # B11
        swir22 = bands_t.get('swir22', None)  # B12
        if red is None or nir is None:
            continue
        # Compute indices at this timestep
        # Use nir08 as red-edge if available, else swir16 as proxy
        rededge = nir08 if nir08 is not None else (swir16 if swir16 is not None else None)
        indices_t = compute_indices_from_bands(
            red=red,
            nir=nir,
            blue=blue,
            green=green,
            swir1=swir16,
            swir2=swir22
        )
        # Add NDRE and CI_RE if we have red-edge; this overrides the
        # swir1-proxy values computed inside compute_indices_from_bands.
        if rededge is not None:
            denom = nir + rededge
            indices_t['ndre'] = np.where(denom != 0, (nir - rededge) / denom, 0)
            indices_t['ci_re'] = np.where(rededge != 0, (nir / rededge) - 1, 0)
        # Stack into timeseries
        for idx_name, idx_arr in indices_t.items():
            if idx_name not in timeseries_dict:
                timeseries_dict[idx_name] = np.zeros((H, W, n_times), dtype=np.float32)
            # NOTE(review): assumes the stackstac grid matches target_profile's
            # (H, W) exactly -- confirm, otherwise this assignment raises.
            timeseries_dict[idx_name][:, :, t] = idx_arr.astype(np.float32)
    # Ensure at least one index exists
    if not timeseries_dict:
        print(" ⚠️ No indices computed, returning placeholders")
        return _build_placeholder_features(H, W, target_profile)
    # ========================================
    # Apply Feature Engineering Pipeline
    # (matching train.py exactly)
    # ========================================
    print(" 🔧 Applying feature engineering pipeline...")
    # 1. Apply smoothing (Savitzky-Golay)
    print(" - Smoothing (Savitzky-Golay window=5, polyorder=2)")
    smoothed_dict = apply_smoothing_to_rasters(timeseries_dict, date_strings)
    # 2. Extract phenology
    print(" - Phenology metrics (amplitude, AUC, peak, slope)")
    phenology_features = extract_phenology_from_rasters(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndre', 'evi', 'savi']
    )
    # 3. Add harmonics
    print(" - Harmonic features (1st/2nd order sin/cos)")
    harmonic_features = add_harmonics_to_rasters(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndre', 'evi']
    )
    # 4. Seasonal windows + interactions
    print(" - Seasonal windows (Early/Peak/Late) + interactions")
    window_features = add_seasonal_windows_and_interactions(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndwi', 'ndre'],
        phenology_features=phenology_features
    )
    # ========================================
    # Combine all features
    # ========================================
    # Collect all features in one dict (later groups never collide by name).
    all_features = {}
    all_features.update(phenology_features)
    all_features.update(harmonic_features)
    all_features.update(window_features)
    # Get feature names in a deterministic order so channels always line up
    # with the model's expected feature order:
    # phenology (ndvi -> ndre -> evi -> savi) -> harmonics -> windows -> interactions
    feat_names = []
    # Phenology order: ndvi, ndre, evi, savi
    for idx in ['ndvi', 'ndre', 'evi', 'savi']:
        for suffix in ['_max', '_min', '_mean', '_std', '_amplitude', '_auc', '_peak_timestep', '_max_slope_up', '_max_slope_down']:
            key = f'{idx}{suffix}'
            if key in all_features:
                feat_names.append(key)
    # Harmonics order: ndvi, ndre, evi
    for idx in ['ndvi', 'ndre', 'evi']:
        for suffix in ['_harmonic1_sin', '_harmonic1_cos', '_harmonic2_sin', '_harmonic2_cos']:
            key = f'{idx}{suffix}'
            if key in all_features:
                feat_names.append(key)
    # Window features: ndvi, ndwi, ndre (early, peak, late)
    for idx in ['ndvi', 'ndwi', 'ndre']:
        for win in ['early', 'peak', 'late']:
            for stat in ['_mean', '_max']:
                key = f'{idx}_{win}{stat}'
                if key in all_features:
                    feat_names.append(key)
    # Interactions
    if 'ndvi_ndre_peak_diff' in all_features:
        feat_names.append('ndvi_ndre_peak_diff')
    if 'canopy_density_contrast' in all_features:
        feat_names.append('canopy_density_contrast')
    print(f" Total features: {len(feat_names)}")
    # Build the (H, W, C) feature array in feat_names order.
    feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32)
    for i, feat_name in enumerate(feat_names):
        if feat_name in all_features:
            feat_arr[:, :, i] = all_features[feat_name]
    # Handle NaN/Inf so downstream scalers/models never see non-finite values.
    feat_arr = np.nan_to_num(feat_arr, nan=0.0, posinf=0.0, neginf=0.0)
    # ========================================
    # Build aux layers for visualization
    # ========================================
    aux_layers = {}
    # True color composite (per-band temporal median across all observations).
    if 'red' in available_bands and 'green' in available_bands and 'blue' in available_bands:
        # Get median of clear observations
        red_arr = available_bands['red']  # (T, H, W)
        green_arr = available_bands['green']
        blue_arr = available_bands['blue']
        # Simple median composite
        tc = np.stack([
            np.median(red_arr, axis=0),
            np.median(green_arr, axis=0),
            np.median(blue_arr, axis=0),
        ], axis=-1)
        aux_layers['true_color'] = tc.astype(np.uint16)
    # Index peaks for visualization
    for idx in ['ndvi', 'evi', 'savi']:
        if f'{idx}_max' in all_features:
            aux_layers[f'{idx}_peak'] = all_features[f'{idx}_max']
    feat_profile = target_profile.copy()
    feat_profile.update({"count": 1, "dtype": "float32"})
    return feat_arr, feat_profile, feat_names, aux_layers
def _build_placeholder_features(H: int, W: int, target_profile: dict) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
"""Build placeholder features when DEA STAC is unavailable.
This allows the pipeline to run during development without API access.
"""
# Minimal feature set matching training expected features
feat_names = ["ndvi_peak", "evi_peak", "savi_peak"]
feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32)
aux_layers = {
"true_color": np.zeros((H, W, 3), dtype=np.uint16),
"ndvi_peak": np.zeros((H, W), dtype=np.float32),
"evi_peak": np.zeros((H, W), dtype=np.float32),
"savi_peak": np.zeros((H, W), dtype=np.float32),
}
feat_profile = target_profile.copy()
feat_profile.update({"count": 1, "dtype": "float32"})
return feat_arr, feat_profile, feat_names, aux_layers
# -------------------------
# Neighborhood smoothing
# -------------------------
def majority_filter(arr: np.ndarray, k: int = 3) -> np.ndarray:
    """Replace each pixel with the most frequent label in its k x k window.

    Supports integer label rasters (fast bincount path) and string/object
    label rasters (slower unique-counts path). Ties resolve to the smallest
    label. Edges are handled by edge-padding. k must be odd and >= 3.

    NOTE: simple CPU implementation. For speed: convert labels to ints and
    use scipy.ndimage or numba, or apply rasterio/gdal focal statistics.
    """
    if k < 3 or k % 2 == 0:
        raise ValueError("k must be odd and >= 3")
    radius = k // 2
    rows, cols = arr.shape
    padded = np.pad(arr, ((radius, radius), (radius, radius)), mode="edge")
    result = arr.copy()
    if np.issubdtype(arr.dtype, np.integer):
        # Fast path: histogram via bincount (labels assumed non-negative).
        n_bins = (int(arr.max()) if arr.size else 0) + 1
        for r in range(rows):
            for c in range(cols):
                neighbourhood = padded[r:r + k, c:c + k].ravel()
                result[r, c] = np.bincount(neighbourhood, minlength=n_bins).argmax()
        return result
    # Generic path for string/object labels.
    for r in range(rows):
        for c in range(cols):
            neighbourhood = padded[r:r + k, c:c + k].ravel()
            labels, freqs = np.unique(neighbourhood, return_counts=True)
            result[r, c] = labels[freqs.argmax()]
    return result