# geocrop-platform/apps/worker/features.py
"""Feature engineering + geospatial helpers for GeoCrop.
This module is shared by training (feature selection + scaling helpers)
AND inference (DEA STAC fetch + raster alignment + smoothing).
IMPORTANT: This implementation exactly replicates train.py feature engineering:
- Savitzky-Golay smoothing (window=5, polyorder=2) with 0-interpolation
- Phenology metrics (amplitude, AUC, peak_timestep, max_slope)
- Harmonic/Fourier features (1st and 2nd order sin/cos)
- Seasonal window statistics (Early: Oct-Dec, Peak: Jan-Mar, Late: Apr-Jun)
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, Iterable, List, Optional, Tuple
import numpy as np
import pandas as pd
# Raster / geo
import rasterio
from rasterio.enums import Resampling
# ==========================================
# Training helpers
# ==========================================
def drop_junk_columns(df: pd.DataFrame, junk_cols: List[str]) -> pd.DataFrame:
    """Remove leakage-prone junk/spatial columns from *df*.

    Mirrors train.py's junk_cols list: ['.geo', 'system:index', 'latitude',
    'longitude', 'lat', 'lon', 'ID', 'parent_id', 'batch_id', 'is_syn'].
    Names absent from *df* are silently ignored; a new frame is returned.
    """
    present = set(df.columns)
    return df.drop(columns=[name for name in junk_cols if name in present])
def scout_feature_selection(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    n_estimators: int = 100,
    random_state: int = 42,
) -> List[str]:
    """Quick LightGBM 'scout' pass: keep features with non-zero importance.

    Fits a throwaway LGBMClassifier and returns the feature names whose
    importance is strictly positive, ordered by descending importance.
    Falls back to every column if nothing survives the cut.
    """
    import lightgbm as lgb
    scout = lgb.LGBMClassifier(
        n_estimators=n_estimators, random_state=random_state, verbose=-1
    )
    scout.fit(X_train, y_train)
    ranking = pd.DataFrame(
        {"Feature": X_train.columns, "Importance": scout.feature_importances_}
    ).sort_values("Importance", ascending=False)
    kept = ranking.loc[ranking["Importance"] > 0, "Feature"].tolist()
    # Never return an empty feature set -- better than breaking training.
    return kept if kept else list(X_train.columns)
def scale_numeric_features(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
):
    """Standard-scale only the numeric columns of both frames.

    Fits a StandardScaler on X_train's numeric columns (matches train.py),
    applies it to train and test copies, and returns
    (X_train_scaled, X_test_scaled, scaler). Non-numeric columns pass
    through untouched.
    """
    from sklearn.preprocessing import StandardScaler
    numeric = X_train.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler().fit(X_train[numeric])
    train_out = X_train.copy()
    test_out = X_test.copy()
    train_out[numeric] = scaler.transform(X_train[numeric])
    test_out[numeric] = scaler.transform(X_test[numeric])
    return train_out, test_out, scaler
# ==========================================
# INFERENCE-ONLY FEATURE ENGINEERING
# These functions replicate train.py for raster-based inference
# ==========================================
def apply_smoothing_to_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str]
) -> Dict[str, np.ndarray]:
    """Apply Savitzky-Golay smoothing to time-series raster arrays.

    Replicates train.py apply_smoothing():
      1. Replace 0 with NaN (0 is treated as nodata/masked)
      2. Linear interpolate across the time axis, fillna(0)
      3. Savitzky-Golay: window_length=5, polyorder=2

    Improvements over the previous version:
      * the per-pixel Python loop (one pandas Series per pixel) is replaced
        by a single vectorized DataFrame.interpolate(axis=1) call;
      * the smoothing window shrinks gracefully for very short series
        (savgol_filter requires an odd window <= T and > polyorder), where
        the old code raised ValueError.

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array
        dates: List of date strings in YYYYMMDD format (len == T)

    Returns:
        Dict mapping index name to smoothed (H, W, T) float array
    """
    from scipy.signal import savgol_filter
    smoothed: Dict[str, np.ndarray] = {}
    for idx_name, arr in timeseries_dict.items():
        # arr shape: (H, W, T) -> flatten space so each row is one pixel.
        H, W, T = arr.shape
        arr_2d = arr.reshape(-1, T).astype(np.float64)
        # 1. Treat exact zeros as missing observations.
        arr_2d = np.where(arr_2d == 0, np.nan, arr_2d)
        # 2. Vectorized linear interpolation along time (axis=1); identical
        #    per-row semantics to Series.interpolate(limit_direction='both').
        interp_arr = (
            pd.DataFrame(arr_2d)
            .interpolate(method='linear', axis=1, limit_direction='both')
            .fillna(0.0)
            .to_numpy()
        )
        # 3. Savitzky-Golay smoothing: window=5, polyorder=2, clamped so the
        #    filter stays valid for short series (odd window, > polyorder).
        window = 5 if T >= 5 else (T if T % 2 == 1 else max(T - 1, 1))
        poly = min(2, window - 1)
        smooth_arr = savgol_filter(interp_arr, window_length=window, polyorder=poly, axis=1)
        # Reshape back to (H, W, T)
        smoothed[idx_name] = smooth_arr.reshape(H, W, T)
    return smoothed
def extract_phenology_from_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi', 'ndre', 'evi']
) -> Dict[str, np.ndarray]:
    """Derive per-pixel phenology metrics from (smoothed) time-series cubes.

    Mirrors train.py extract_phenology():
      * magnitude: max / min / mean / std / amplitude
      * auc: trapezoidal integral with dx=10 (10-day compositing interval)
      * timing: peak_timestep (argmax along time)
      * slopes: max_slope_up / max_slope_down from first differences

    Args:
        timeseries_dict: index name -> (H, W, T) array (should be smoothed)
        dates: list of date strings (kept for API symmetry)
        indices: which indices to process

    Returns:
        feature name -> (H, W) array
    """
    from scipy.integrate import trapezoid
    out: Dict[str, np.ndarray] = {}
    for name in indices:
        if name not in timeseries_dict:
            continue
        cube = timeseries_dict[name]  # (H, W, T)
        H, W, T = cube.shape
        flat = cube.reshape(-1, T)  # one row per pixel
        # --- magnitude metrics ---
        vmax = flat.max(axis=1)
        vmin = flat.min(axis=1)
        out[f'{name}_max'] = vmax.reshape(H, W)
        out[f'{name}_min'] = vmin.reshape(H, W)
        out[f'{name}_mean'] = flat.mean(axis=1).reshape(H, W)
        out[f'{name}_std'] = flat.std(axis=1).reshape(H, W)
        out[f'{name}_amplitude'] = (vmax - vmin).reshape(H, W)
        # --- area under curve (dx=10, 10-day intervals) ---
        out[f'{name}_auc'] = trapezoid(flat, dx=10, axis=1).reshape(H, W)
        # --- timing ---
        out[f'{name}_peak_timestep'] = flat.argmax(axis=1).reshape(H, W)
        # --- rates of change ---
        deltas = np.diff(flat, axis=1)  # (H*W, T-1)
        out[f'{name}_max_slope_up'] = deltas.max(axis=1).reshape(H, W)
        out[f'{name}_max_slope_down'] = deltas.min(axis=1).reshape(H, W)
    return out
def add_harmonics_to_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi']
) -> Dict[str, np.ndarray]:
    """Project each pixel's time series onto 1st/2nd-order Fourier bases.

    Mirrors train.py add_harmonics(): with t = 2*pi*step/n_times, the four
    coefficients per index are the n_times-normalised dot products of the
    series against sin(t), cos(t), sin(2t), cos(2t).

    Args:
        timeseries_dict: index name -> (H, W, T) array (should be smoothed)
        dates: list of date strings (only the count is used)
        indices: which indices to process

    Returns:
        feature name -> (H, W) array
    """
    n_times = len(dates)
    # One full 0..2*pi cycle over the season.
    phase = 2.0 * np.pi * np.arange(n_times) / n_times
    # Basis vectors keyed by the output feature suffix.
    basis = {
        'harmonic1_sin': np.sin(phase),
        'harmonic1_cos': np.cos(phase),
        'harmonic2_sin': np.sin(2 * phase),
        'harmonic2_cos': np.cos(2 * phase),
    }
    out: Dict[str, np.ndarray] = {}
    for name in indices:
        if name not in timeseries_dict:
            continue
        cube = timeseries_dict[name]  # (H, W, T)
        H, W, T = cube.shape
        flat = cube.reshape(-1, T)
        for suffix, vec in basis.items():
            # Normalized dot product, reshaped back to the raster grid.
            out[f'{name}_{suffix}'] = (flat @ vec / n_times).reshape(H, W)
    return out
def add_seasonal_windows_and_interactions(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi', 'ndwi', 'ndre'],
    phenology_features: Dict[str, np.ndarray] = None
) -> Dict[str, np.ndarray]:
    """Seasonal-window statistics plus cross-index interaction features.

    Mirrors train.py add_interactions_and_windows() for the Zimbabwe
    growing season (Oct-Jun):
      * windows: early (Oct-Dec), peak (Jan-Mar), late (Apr-Jun)
        -> per-window mean and max
      * interactions (when phenology_features is given):
        - ndvi_ndre_peak_diff = ndvi_max - ndre_max
        - canopy_density_contrast = evi_mean / (ndvi_mean + 0.001)

    Args:
        timeseries_dict: index name -> (H, W, T) array
        dates: YYYYMMDD strings, one per timestep
        indices: which indices get window statistics
        phenology_features: phenology dict used for the interaction terms

    Returns:
        feature name -> (H, W) array
    """
    out: Dict[str, np.ndarray] = {}
    months_per_step = pd.to_datetime(dates, format='%Y%m%d').month
    season_months = {
        'early': {10, 11, 12},  # Oct-Dec
        'peak': {1, 2, 3},      # Jan-Mar
        'late': {4, 5, 6},      # Apr-Jun
    }
    for name in indices:
        if name not in timeseries_dict:
            continue
        cube = timeseries_dict[name]  # (H, W, T)
        H, W, T = cube.shape
        for season, wanted in season_months.items():
            mask = np.array([m in wanted for m in months_per_step])
            if not mask.any():
                continue  # no acquisitions fall inside this window
            flat = cube[:, :, mask].reshape(H * W, -1)
            out[f'{name}_{season}_mean'] = flat.mean(axis=1).reshape(H, W)
            out[f'{name}_{season}_max'] = flat.max(axis=1).reshape(H, W)
    # Interactions need the phenology summaries computed earlier.
    if phenology_features is not None:
        pf = phenology_features
        if 'ndvi_max' in pf and 'ndre_max' in pf:
            out['ndvi_ndre_peak_diff'] = pf['ndvi_max'] - pf['ndre_max']
        if 'evi_mean' in pf and 'ndvi_mean' in pf:
            out['canopy_density_contrast'] = pf['evi_mean'] / (pf['ndvi_mean'] + 0.001)
    return out
# ==========================================
# Inference helpers
# ==========================================
# Type alias for an area of interest: (lon_degrees, lat_degrees, radius_metres).
AOI = Tuple[float, float, float]
def validate_aoi_zimbabwe(aoi: AOI, max_radius_m: float = 5000.0):
    """Cheap sanity checks on an AOI (lon, lat, radius_m) tuple.

    Raises ValueError when the radius is non-positive or exceeds
    *max_radius_m*, or when the centre falls outside a rough Zimbabwe
    bounding box.

    NOTE: for production, check the buffered circle against a real
    Zimbabwe polygon (e.g. a simplified boundary GeoJSON via shapely).
    """
    lon, lat, radius_m = aoi
    if radius_m <= 0 or radius_m > max_radius_m:
        raise ValueError(f"radius_m must be in (0, {max_radius_m}]")
    # Coarse Zimbabwe bbox pre-check: lon 25.2..33.1, lat -22.5..-15.6
    lon_ok = 25.2 <= lon <= 33.1
    lat_ok = -22.5 <= lat <= -15.6
    if not (lon_ok and lat_ok):
        raise ValueError("AOI must be within Zimbabwe")
def clip_raster_to_aoi(
    src_path: str,
    aoi: AOI,
    dst_profile_like: Optional[dict] = None,
) -> Tuple[np.ndarray, dict]:
    """Clip a raster to the AOI circle's bounding box.

    Template implementation: reads a window around the circle's bbox only.
    For an exact circular clip, add a mask step after reading.

    Args:
        src_path: path to the source raster (band 1 is read).
        aoi: (lon, lat, radius_m) tuple.
        dst_profile_like: optional profile to resample/align the clip to.

    Returns:
        (array, profile) for the clipped window.
    """
    # BUGFIX: import the windows submodule explicitly -- rasterio does not
    # guarantee `rasterio.windows` is reachable from the bare top-level import.
    from rasterio.windows import from_bounds
    from rasterio.windows import transform as window_transform

    lon, lat, radius_m = aoi
    with rasterio.open(src_path) as src:
        # Approx bbox from radius using the rough 1 deg ~ 111.32 km rule.
        # Production: use a pyproj geodesic buffer.
        deg = radius_m / 111_320.0
        minx, maxx = lon - deg, lon + deg
        miny, maxy = lat - deg, lat + deg
        window = from_bounds(minx, miny, maxx, maxy, transform=src.transform)
        window = window.round_offsets().round_lengths()
        arr = src.read(1, window=window)
        profile = src.profile.copy()
        # The windowed read has its own grid; update shape + transform.
        profile.update(
            {
                "height": arr.shape[0],
                "width": arr.shape[1],
                "transform": window_transform(window, src.transform),
            }
        )
    # Optional: resample/align to dst_profile_like (src no longer needed).
    if dst_profile_like is not None:
        arr, profile = _resample_to_profile(arr, profile, dst_profile_like)
    return arr, profile
def _resample_to_profile(arr: np.ndarray, src_profile: dict, dst_profile: dict) -> Tuple[np.ndarray, dict]:
    """Nearest-neighbour reproject/resample *arr* onto the dst grid.

    Args:
        arr: single-band (H, W) source array.
        src_profile: rasterio profile describing *arr* (transform, crs, ...).
        dst_profile: target profile (height, width, transform, crs).

    Returns:
        (resampled array, profile copied from dst with count/dtype updated).
    """
    # BUGFIX: `rasterio.warp` and `rasterio.io` are NOT attributes of the bare
    # `rasterio` package -- accessing them without an explicit import raises
    # AttributeError. Import the needed names explicitly.
    from rasterio.io import MemoryFile
    from rasterio.warp import reproject

    dst_h = dst_profile["height"]
    dst_w = dst_profile["width"]
    dst_arr = np.empty((dst_h, dst_w), dtype=arr.dtype)
    # Round-trip through an in-memory dataset so reproject can read band 1.
    with MemoryFile() as mem:
        with mem.open(**src_profile) as src:
            src.write(arr, 1)
            reproject(
                source=rasterio.band(src, 1),
                destination=dst_arr,
                src_transform=src_profile["transform"],
                src_crs=src_profile["crs"],
                dst_transform=dst_profile["transform"],
                dst_crs=dst_profile["crs"],
                resampling=Resampling.nearest,
            )
    prof = dst_profile.copy()
    prof.update({"count": 1, "dtype": str(dst_arr.dtype)})
    return dst_arr, prof
def load_dw_baseline_window(cfg, year: int, season: str, aoi: AOI) -> Tuple[np.ndarray, dict]:
    """Load the DW baseline seasonal COG from MinIO and clip it to the AOI.

    cfg.storage decides whether the COG is streamed or downloaded locally.
    Expected naming convention:
        dw_{season}_{year}.tif  OR  DW_Zim_HighestConf_{year}_{year+1}.tif
    (a mapping can live in cfg.dw_key_for(year, season)).
    """
    local_path = cfg.storage.get_dw_local_path(year=year, season=season)
    arr, profile = clip_raster_to_aoi(local_path, aoi)
    # Normalise to a single-band profile with an explicit dtype.
    profile.update({"count": 1})
    profile.setdefault("dtype", str(arr.dtype))
    return arr, profile
# -------------------------
# DEA STAC feature stack
# -------------------------
def compute_indices_from_bands(
    red: np.ndarray,
    nir: np.ndarray,
    blue: np.ndarray = None,
    green: np.ndarray = None,
    swir1: np.ndarray = None,
    swir2: np.ndarray = None,
    rededge: np.ndarray = None
) -> Dict[str, np.ndarray]:
    """Compute vegetation indices from band arrays.

    Indices computed (when the needed bands are present):
      - NDVI  = (NIR - Red) / (NIR + Red)
      - EVI   = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1)
      - SAVI  = ((NIR - Red) / (NIR + Red + L)) * (1 + L), L = 0.5
      - NDRE  = (NIR - RedEdge) / (NIR + RedEdge)
      - CI_RE = (NIR / RedEdge) - 1
      - NDWI  = (Green - NIR) / (Green + NIR)
    All ratios return 0 where the denominator is 0.

    Args:
        red: Red band (B4)
        nir: NIR band (B8)
        blue: Blue band (B2, optional; enables EVI)
        green: Green band (B3, optional; enables NDWI)
        swir1: SWIR1 band (B11, optional; red-edge proxy for NDRE/CI_RE)
        swir2: SWIR2 band (B12, optional; currently unused)
        rededge: Red-edge band (e.g. B5/B8A, optional). BUGFIX: the old code
            tested `'rededge' in locals()` but never defined the name, so the
            true red-edge branch was unreachable dead code; it is now a real
            (backward-compatible, defaulted) parameter.

    Returns:
        Dict mapping index name to float64 array.
    """
    def _ratio(num: np.ndarray, den: np.ndarray) -> np.ndarray:
        # num/den with 0 where den == 0; `where=` avoids the divide-by-zero
        # RuntimeWarnings that np.where-based guards still emit.
        out = np.zeros_like(den, dtype=np.float64)
        np.divide(num, den, out=out, where=den != 0)
        return out

    indices: Dict[str, np.ndarray] = {}
    # Ensure float64 for precision
    nir = nir.astype(np.float64)
    red = red.astype(np.float64)
    # NDVI = (NIR - Red) / (NIR + Red)
    indices['ndvi'] = _ratio(nir - red, nir + red)
    # EVI = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1)
    if blue is not None:
        blue = blue.astype(np.float64)
        indices['evi'] = _ratio(2.5 * (nir - red), nir + 6 * red - 7.5 * blue + 1)
    # SAVI with soil-brightness factor L=0.5; the (1 + L) rescale is folded
    # into the numerator (identical result).
    L = 0.5
    indices['savi'] = _ratio((nir - red) * (1 + L), nir + red + L)
    # NDRE / CI_RE: prefer a true red-edge band, else fall back to SWIR1
    # as a proxy (matches the old fallback behavior).
    re_band = rededge if rededge is not None else swir1
    if re_band is not None:
        re_band = re_band.astype(np.float64)
        indices['ndre'] = _ratio(nir - re_band, nir + re_band)
        indices['ci_re'] = np.where(re_band != 0, _ratio(nir, re_band) - 1, 0)
    # NDWI = (Green - NIR) / (Green + NIR)
    if green is not None:
        green = green.astype(np.float64)
        indices['ndwi'] = _ratio(green - nir, green + nir)
    return indices
def build_feature_stack_from_dea(
    cfg,
    aoi: AOI,
    start_date: str,
    end_date: str,
    target_profile: dict,
) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
    """Query DEA STAC and compute a per-pixel feature cube.

    This function implements the FULL feature engineering pipeline matching train.py:
    1. Load Sentinel-2 data from DEA STAC
    2. Compute indices (ndvi, ndre, evi, savi, ci_re, ndwi)
    3. Apply Savitzky-Golay smoothing with 0-interpolation
    4. Extract phenology metrics (amplitude, AUC, peak, slope)
    5. Add harmonic/fourier features
    6. Add seasonal window statistics
    7. Add index interactions

    Any failure while querying/loading STAC data (including "no scenes found")
    degrades to `_build_placeholder_features` so development can proceed offline.

    Args:
        cfg: worker config; only the optional `cfg.dea_stac_url` attribute is read.
        aoi: (lon, lat, radius_m) tuple.
        start_date: ISO date string, inclusive lower bound of the STAC search.
        end_date: ISO date string, inclusive upper bound of the STAC search.
        target_profile: raster profile supplying the output "height"/"width".

    Returns:
        feat_arr: (H, W, C) float32 feature cube
        feat_profile: raster profile aligned to target_profile
        feat_names: list[str], one name per channel of feat_arr
        aux_layers: dict for extra outputs (true_color, ndvi, evi, savi peaks)

    Raises:
        ImportError: when pystac-client / stackstac are not installed.
    """
    # Import STAC dependencies lazily so the module imports without them.
    try:
        import pystac_client
        import stackstac
    except ImportError:
        raise ImportError("pystac-client and stackstac are required for DEA STAC loading")
    # NOTE(review): these two imports are unused in this function -- the
    # smoothing/phenology helpers import them internally. Kept as-is.
    from scipy.signal import savgol_filter
    from scipy.integrate import trapezoid
    H = target_profile["height"]
    W = target_profile["width"]
    # DEA STAC configuration (cfg may omit the URL; fall back to the public endpoint).
    stac_url = cfg.dea_stac_url if hasattr(cfg, 'dea_stac_url') else "https://explorer.digitalearth.africa/stac"
    # AOI to bbox using the rough 1 deg ~ 111.32 km conversion.
    lon, lat, radius_m = aoi
    deg = radius_m / 111_320.0
    bbox = [lon - deg, lat - deg, lon + deg, lat + deg]
    # Query DEA STAC
    print(f"🔍 Querying DEA STAC: {stac_url}")
    print(f" _bbox: {bbox}")
    print(f" _dates: {start_date} to {end_date}")
    try:
        client = pystac_client.Client.open(stac_url)
        # Search for Sentinel-2 L2A
        search = client.search(
            collections=["s2_l2a"],
            bbox=bbox,
            datetime=f"{start_date}/{end_date}",
            query={
                "eo:cloud_cover": {"lt": 30},  # Cloud filter
            }
        )
        items = list(search.items())
        print(f" Found {len(items)} Sentinel-2 scenes")
        # NOTE(review): this raise is caught by the except below, so "no
        # imagery" also falls through to placeholder features.
        if len(items) == 0:
            raise ValueError("No Sentinel-2 imagery available for the selected AOI and date range")
        # Load data using stackstac
        # Required bands: red, green, blue, nir, rededge (B5), swir1, swir2
        bands = ["red", "green", "blue", "nir", "nir08", "nir09", "swir16", "swir22"]
        cube = stackstac.stack(
            items,
            bounds=bbox,
            resolution=10,  # 10m (Sentinel-2 native)
            bands=bands,
            chunks={"x": 512, "y": 512},
            epsg=32736,  # UTM Zone 36S (Zimbabwe)
        )
        print(f" Loaded cube shape: {cube.shape}")
    except Exception as e:
        # Broad catch by design: any STAC/loading failure degrades to
        # placeholder features so the pipeline still runs offline.
        print(f" ⚠️ DEA STAC loading failed: {e}")
        print(f" Returning placeholder features for development")
        return _build_placeholder_features(H, W, target_profile)
    # Extract acquisition dates from the cube's time coordinate.
    cube_dates = pd.to_datetime(cube.time.values)
    date_strings = [d.strftime('%Y%m%d') for d in cube_dates]
    # Get band data -- stackstac returns (T, C, H, W) and is indexed as such
    # below (no transpose happens). This materialises the dask cube in memory.
    band_data = cube.values  # (T, C, H, W)
    n_times = band_data.shape[0]
    # Map bands to names
    band_names = list(cube.band.values)
    # Extract individual bands
    def get_band_data(band_name):
        # Falls back to channel 0 when the name is missing; callers below
        # guard with `if bn in band_names`, so the fallback is never hit.
        idx = band_names.index(band_name) if band_name in band_names else 0
        # Shape: (T, H, W)
        return band_data[:, idx, :, :]
    # Build timeseries dict for each index
    # Compute indices for each timestep
    indices_list = []  # NOTE(review): unused; kept to preserve behavior
    # Get available bands
    available_bands = {}
    for bn in ['red', 'green', 'blue', 'nir', 'nir08', 'nir09', 'swir16', 'swir22']:
        if bn in band_names:
            available_bands[bn] = get_band_data(bn)
    # Compute indices for each timestep
    timeseries_dict = {}
    for t in range(n_times):
        # Get the (H, W) slice of every available band for this timestep.
        bands_t = {k: v[t] for k, v in available_bands.items()}
        # Compute indices
        red = bands_t.get('red', None)
        nir = bands_t.get('nir', None)
        green = bands_t.get('green', None)
        blue = bands_t.get('blue', None)
        nir08 = bands_t.get('nir08', None)  # B8A (red-edge)
        swir16 = bands_t.get('swir16', None)  # B11
        swir22 = bands_t.get('swir22', None)  # B12
        if red is None or nir is None:
            continue
        # Compute indices at this timestep
        # Use nir08 as red-edge if available, else swir16 as proxy
        rededge = nir08 if nir08 is not None else (swir16 if swir16 is not None else None)
        indices_t = compute_indices_from_bands(
            red=red,
            nir=nir,
            blue=blue,
            green=green,
            swir1=swir16,
            swir2=swir22
        )
        # Add NDRE and CI_RE if we have red-edge; this overrides the
        # swir1-proxy values computed inside compute_indices_from_bands.
        if rededge is not None:
            denom = nir + rededge
            indices_t['ndre'] = np.where(denom != 0, (nir - rededge) / denom, 0)
            indices_t['ci_re'] = np.where(rededge != 0, (nir / rededge) - 1, 0)
        # Stack into timeseries
        for idx_name, idx_arr in indices_t.items():
            if idx_name not in timeseries_dict:
                timeseries_dict[idx_name] = np.zeros((H, W, n_times), dtype=np.float32)
            # NOTE(review): assumes the stackstac grid matches target_profile's
            # (H, W) exactly -- confirm, otherwise this assignment raises.
            timeseries_dict[idx_name][:, :, t] = idx_arr.astype(np.float32)
    # Ensure at least one index exists
    if not timeseries_dict:
        print(" ⚠️ No indices computed, returning placeholders")
        return _build_placeholder_features(H, W, target_profile)
    # ========================================
    # Apply Feature Engineering Pipeline
    # (matching train.py exactly)
    # ========================================
    print(" 🔧 Applying feature engineering pipeline...")
    # 1. Apply smoothing (Savitzky-Golay)
    print(" - Smoothing (Savitzky-Golay window=5, polyorder=2)")
    smoothed_dict = apply_smoothing_to_rasters(timeseries_dict, date_strings)
    # 2. Extract phenology
    print(" - Phenology metrics (amplitude, AUC, peak, slope)")
    phenology_features = extract_phenology_from_rasters(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndre', 'evi', 'savi']
    )
    # 3. Add harmonics
    print(" - Harmonic features (1st/2nd order sin/cos)")
    harmonic_features = add_harmonics_to_rasters(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndre', 'evi']
    )
    # 4. Seasonal windows + interactions
    print(" - Seasonal windows (Early/Peak/Late) + interactions")
    window_features = add_seasonal_windows_and_interactions(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndwi', 'ndre'],
        phenology_features=phenology_features
    )
    # ========================================
    # Combine all features
    # ========================================
    # Collect all features in one dict (later groups never collide by name).
    all_features = {}
    all_features.update(phenology_features)
    all_features.update(harmonic_features)
    all_features.update(window_features)
    # Get feature names in a deterministic order so channels always line up
    # with the model's expected feature order:
    # phenology (ndvi -> ndre -> evi -> savi) -> harmonics -> windows -> interactions
    feat_names = []
    # Phenology order: ndvi, ndre, evi, savi
    for idx in ['ndvi', 'ndre', 'evi', 'savi']:
        for suffix in ['_max', '_min', '_mean', '_std', '_amplitude', '_auc', '_peak_timestep', '_max_slope_up', '_max_slope_down']:
            key = f'{idx}{suffix}'
            if key in all_features:
                feat_names.append(key)
    # Harmonics order: ndvi, ndre, evi
    for idx in ['ndvi', 'ndre', 'evi']:
        for suffix in ['_harmonic1_sin', '_harmonic1_cos', '_harmonic2_sin', '_harmonic2_cos']:
            key = f'{idx}{suffix}'
            if key in all_features:
                feat_names.append(key)
    # Window features: ndvi, ndwi, ndre (early, peak, late)
    for idx in ['ndvi', 'ndwi', 'ndre']:
        for win in ['early', 'peak', 'late']:
            for stat in ['_mean', '_max']:
                key = f'{idx}_{win}{stat}'
                if key in all_features:
                    feat_names.append(key)
    # Interactions
    if 'ndvi_ndre_peak_diff' in all_features:
        feat_names.append('ndvi_ndre_peak_diff')
    if 'canopy_density_contrast' in all_features:
        feat_names.append('canopy_density_contrast')
    print(f" Total features: {len(feat_names)}")
    # Build the (H, W, C) feature array in feat_names order.
    feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32)
    for i, feat_name in enumerate(feat_names):
        if feat_name in all_features:
            feat_arr[:, :, i] = all_features[feat_name]
    # Handle NaN/Inf so downstream scalers/models never see non-finite values.
    feat_arr = np.nan_to_num(feat_arr, nan=0.0, posinf=0.0, neginf=0.0)
    # ========================================
    # Build aux layers for visualization
    # ========================================
    aux_layers = {}
    # True color composite (per-band temporal median across all observations).
    if 'red' in available_bands and 'green' in available_bands and 'blue' in available_bands:
        # Get median of clear observations
        red_arr = available_bands['red']  # (T, H, W)
        green_arr = available_bands['green']
        blue_arr = available_bands['blue']
        # Simple median composite
        tc = np.stack([
            np.median(red_arr, axis=0),
            np.median(green_arr, axis=0),
            np.median(blue_arr, axis=0),
        ], axis=-1)
        aux_layers['true_color'] = tc.astype(np.uint16)
    # Index peaks for visualization
    for idx in ['ndvi', 'evi', 'savi']:
        if f'{idx}_max' in all_features:
            aux_layers[f'{idx}_peak'] = all_features[f'{idx}_max']
    feat_profile = target_profile.copy()
    feat_profile.update({"count": 1, "dtype": "float32"})
    return feat_arr, feat_profile, feat_names, aux_layers
def _build_placeholder_features(H: int, W: int, target_profile: dict) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
"""Build placeholder features when DEA STAC is unavailable.
This allows the pipeline to run during development without API access.
"""
# Minimal feature set matching training expected features
feat_names = ["ndvi_peak", "evi_peak", "savi_peak"]
feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32)
aux_layers = {
"true_color": np.zeros((H, W, 3), dtype=np.uint16),
"ndvi_peak": np.zeros((H, W), dtype=np.float32),
"evi_peak": np.zeros((H, W), dtype=np.float32),
"savi_peak": np.zeros((H, W), dtype=np.float32),
}
feat_profile = target_profile.copy()
feat_profile.update({"count": 1, "dtype": "float32"})
return feat_arr, feat_profile, feat_names, aux_layers
# -------------------------
# Neighborhood smoothing
# -------------------------
def majority_filter(arr: np.ndarray, k: int = 3) -> np.ndarray:
    """Replace each pixel with the most frequent label in its k x k window.

    Supports integer label rasters (fast bincount path) and string/object
    label rasters (slower unique-counts path). Ties resolve to the smallest
    label. Edges are handled by edge-padding. k must be odd and >= 3.

    NOTE: simple CPU implementation. For speed: convert labels to ints and
    use scipy.ndimage or numba, or apply rasterio/gdal focal statistics.
    """
    if k < 3 or k % 2 == 0:
        raise ValueError("k must be odd and >= 3")
    radius = k // 2
    rows, cols = arr.shape
    padded = np.pad(arr, ((radius, radius), (radius, radius)), mode="edge")
    result = arr.copy()
    if np.issubdtype(arr.dtype, np.integer):
        # Fast path: histogram via bincount (labels assumed non-negative).
        n_bins = (int(arr.max()) if arr.size else 0) + 1
        for r in range(rows):
            for c in range(cols):
                neighbourhood = padded[r:r + k, c:c + k].ravel()
                result[r, c] = np.bincount(neighbourhood, minlength=n_bins).argmax()
        return result
    # Generic path for string/object labels.
    for r in range(rows):
        for c in range(cols):
            neighbourhood = padded[r:r + k, c:c + k].ravel()
            labels, freqs = np.unique(neighbourhood, return_counts=True)
            result[r, c] = labels[freqs.argmax()]
    return result