# geocrop-platform/training/features.py
# (281 lines, 8.7 KiB, Python)
"""Feature engineering + geospatial helpers for GeoCrop.
This module is shared by training (feature selection + scaling helpers)
AND inference (DEA STAC fetch + raster alignment + smoothing).
Roo Code will likely extend this file significantly.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from datetime import date
from typing import Dict, Iterable, List, Optional, Tuple
import numpy as np
import pandas as pd
# Raster / geo
import rasterio
from rasterio.enums import Resampling
# ==========================================
# Training helpers
# ==========================================
def drop_junk_columns(df: pd.DataFrame, junk_cols: List[str]) -> pd.DataFrame:
    """Return a copy of *df* without any of *junk_cols* that actually exist.

    Names in *junk_cols* that are absent from *df* are silently ignored, so
    callers can reuse one shared junk list across differently-shaped frames.
    """
    present = set(df.columns)
    drop_these = [name for name in junk_cols if name in present]
    return df.drop(columns=drop_these)
def scout_feature_selection(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    n_estimators: int = 100,
    random_state: int = 42,
) -> List[str]:
    """Scout LightGBM feature selection (keeps non-zero importances).

    Fits a throwaway LGBMClassifier and returns the feature names whose
    importance is strictly positive, ordered from most to least important.
    Falls back to every column if the model zeroes everything out, so the
    training pipeline never receives an empty feature list.
    """
    import lightgbm as lgb

    model = lgb.LGBMClassifier(n_estimators=n_estimators, random_state=random_state, verbose=-1)
    model.fit(X_train, y_train)
    ranked = (
        pd.DataFrame({"Feature": X_train.columns, "Importance": model.feature_importances_})
        .sort_values("Importance", ascending=False)
    )
    keep = list(ranked.loc[ranked["Importance"] > 0, "Feature"])
    if not keep:
        # Fallback: keep everything (better than breaking training).
        keep = list(X_train.columns)
    return keep
def scale_numeric_features(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
):
    """Scale only numeric columns, return (X_train_scaled, X_test_scaled, scaler).

    The scaler is fit on the training frame alone and merely applied to the
    test frame, avoiding train/test leakage. Non-numeric columns pass through
    untouched; the inputs themselves are not mutated.
    """
    from sklearn.preprocessing import StandardScaler

    numeric_cols = X_train.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    train_out = X_train.copy()
    test_out = X_test.copy()
    train_out[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    test_out[numeric_cols] = scaler.transform(X_test[numeric_cols])
    return train_out, test_out, scaler
# ==========================================
# Inference helpers
# ==========================================
# AOI tuple: (lon, lat, radius_m)
AOI = Tuple[float, float, float]


def validate_aoi_zimbabwe(aoi: AOI, max_radius_m: float = 5000.0):
    """Cheap sanity checks on an AOI before any expensive raster work.

    Rejects a non-positive or oversized radius, and a center that falls
    outside a rough Zimbabwe bounding box.

    Raises:
        ValueError: when the radius or the center is out of range.

    NOTE: For production, use a real Zimbabwe polygon and check the circle
    intersects it (e.g. a simplified boundary GeoJSON + shapely).
    """
    lon, lat, radius_m = aoi
    if radius_m <= 0 or radius_m > max_radius_m:
        raise ValueError(f"radius_m must be in (0, {max_radius_m}]")
    # Rough bbox for Zimbabwe (good cheap pre-check):
    # Lon 25.2..33.1, Lat -22.5..-15.6
    inside_lon = 25.2 <= lon <= 33.1
    inside_lat = -22.5 <= lat <= -15.6
    if not (inside_lon and inside_lat):
        raise ValueError("AOI must be within Zimbabwe")
def clip_raster_to_aoi(
    src_path: str,
    aoi: AOI,
    dst_profile_like: Optional[dict] = None,
) -> Tuple[np.ndarray, dict]:
    """Clip a raster to the bounding box of an AOI circle.

    Reads a window around the circle's bbox (for an exact circular clip,
    apply a mask after reading).

    Args:
        src_path: path to any rasterio-readable raster (e.g. GeoTIFF/COG).
        aoi: (lon, lat, radius_m); only the bbox of the circle is used here.
        dst_profile_like: optional target profile; when given, the clipped
            array is resampled onto that grid via _resample_to_profile.

    Returns:
        (arr, profile) for band 1 of the clipped window, with the profile's
        height/width/transform updated to match.

    Raises:
        rasterio.errors.WindowError: if the AOI bbox does not overlap the
            raster at all (loud failure beats silent misalignment).
    """
    # Explicit submodule import: a bare `import rasterio` does not
    # guarantee `rasterio.windows` is bound as an attribute.
    from rasterio import windows as rio_windows

    lon, lat, radius_m = aoi
    with rasterio.open(src_path) as src:
        # Approx bbox from radius using rough degrees conversion.
        # Production: use pyproj geodesic buffer.
        deg = radius_m / 111_320.0
        minx, maxx = lon - deg, lon + deg
        miny, maxy = lat - deg, lat + deg
        window = rio_windows.from_bounds(minx, miny, maxx, maxy, transform=src.transform)
        window = window.round_offsets().round_lengths()
        # BUGFIX: clamp the window to the dataset extent. A window with
        # negative offsets is truncated by read(), but the transform below
        # would still be computed from the unclipped window, silently
        # misaligning the output raster.
        window = rio_windows.intersection(
            window, rio_windows.Window(0, 0, src.width, src.height)
        )
        arr = src.read(1, window=window)
        profile = src.profile.copy()
        # Update shape + transform so the profile describes the window.
        profile.update(
            {
                "height": arr.shape[0],
                "width": arr.shape[1],
                "transform": rio_windows.transform(window, src.transform),
            }
        )
    # Optional: resample/align to dst_profile_like.
    if dst_profile_like is not None:
        arr, profile = _resample_to_profile(arr, profile, dst_profile_like)
    return arr, profile
def _resample_to_profile(arr: np.ndarray, src_profile: dict, dst_profile: dict) -> Tuple[np.ndarray, dict]:
    """Nearest-neighbor resample of a single band onto the dst grid.

    Args:
        arr: 2D source array.
        src_profile: profile describing arr (needs "transform" and "crs").
        dst_profile: target grid (needs "height", "width", "transform", "crs").

    Returns:
        (dst_arr, prof): resampled array and a copy of dst_profile updated
        to a single band with dst_arr's dtype.
    """
    # BUGFIX: `rasterio.warp` and `rasterio.io` are NOT bound by a bare
    # `import rasterio` — the original attribute access raised
    # AttributeError at runtime. Import them explicitly.
    from rasterio.io import MemoryFile
    from rasterio.warp import reproject

    dst_h = dst_profile["height"]
    dst_w = dst_profile["width"]
    dst_arr = np.empty((dst_h, dst_w), dtype=arr.dtype)
    # Force a single band and arr's dtype for the in-memory dataset:
    # src_profile may come from a multiband raster, but only band 1 is
    # written below.
    mem_profile = src_profile.copy()
    mem_profile.update({"count": 1, "dtype": str(arr.dtype)})
    # Round-trip through an in-memory dataset so reproject can read a band.
    with MemoryFile() as mem:
        with mem.open(**mem_profile) as src:
            src.write(arr, 1)
            reproject(
                source=rasterio.band(src, 1),
                destination=dst_arr,
                src_transform=src_profile["transform"],
                src_crs=src_profile["crs"],
                dst_transform=dst_profile["transform"],
                dst_crs=dst_profile["crs"],
                resampling=Resampling.nearest,
            )
    prof = dst_profile.copy()
    prof.update({"count": 1, "dtype": str(dst_arr.dtype)})
    return dst_arr, prof
def load_dw_baseline_window(cfg, year: int, season: str, aoi: AOI) -> Tuple[np.ndarray, dict]:
    """Load the DW baseline seasonal COG from MinIO and clip it to the AOI.

    The cfg.storage implementation decides whether to stream or download
    locally. Expected naming convention:
        dw_{season}_{year}.tif OR DW_Zim_HighestConf_2015_2016.tif
    A mapping can be implemented in cfg.dw_key_for(year, season).
    """
    local_path = cfg.storage.get_dw_local_path(year=year, season=season)
    window_arr, window_profile = clip_raster_to_aoi(local_path, aoi)
    # Normalize to a single-band profile with an explicit dtype.
    window_profile.update({"count": 1})
    window_profile.setdefault("dtype", str(window_arr.dtype))
    return window_arr, window_profile
# -------------------------
# DEA STAC feature stack
# -------------------------
def build_feature_stack_from_dea(
    cfg,
    aoi: AOI,
    start_date: str,
    end_date: str,
    target_profile: dict,
) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
    """Query DEA STAC and compute a per-pixel feature cube.

    Returns:
        feat_arr: (H, W, C) float32 feature cube
        feat_profile: raster profile aligned to target_profile (1 band)
        feat_names: list[str], one name per channel of feat_arr
        aux_layers: dict of extra outputs (true_color, ndvi, evi, savi)

    Implementation strategy (recommended):
        - Use pystac-client + stackstac or odc-stac to load xarray
        - Reproject/resample to target grid (10m)
        - Compute composites (e.g., median or best-pixel)
        - Compute indices

    For now this is a stub returning zeros so the pipeline wiring works.
    """
    height = target_profile["height"]
    width = target_profile["width"]
    # Placeholder features — real DEA loading replaces these zeros later.
    feat_names = ["ndvi_peak", "evi_peak", "savi_peak"]
    feat_arr = np.zeros((height, width, len(feat_names)), dtype=np.float32)
    aux_layers: Dict[str, np.ndarray] = {
        "true_color": np.zeros((height, width, 3), dtype=np.uint16),
    }
    for channel in feat_names:
        aux_layers[channel] = np.zeros((height, width), dtype=np.float32)
    feat_profile = target_profile.copy()
    feat_profile.update({"count": 1, "dtype": "float32"})
    return feat_arr, feat_profile, feat_names, aux_layers
# -------------------------
# Neighborhood smoothing
# -------------------------
def majority_filter(arr: np.ndarray, k: int = 3) -> np.ndarray:
    """Majority (modal) filter for 2D class label arrays.

    Each output pixel becomes the most frequent value in the k x k window
    centered on it; edges use edge-replicated padding. Ties resolve to the
    smallest value (integer path) or the first value in sorted order
    (generic path).

    Args:
        arr: 2D label array. Integer dtypes take a fast bincount path;
            strings/objects/floats take a slower np.unique path.
        k: window size, odd and >= 3 (3, 5, 7, ...).

    Raises:
        ValueError: if k is even or < 3.

    NOTE: This is a simple CPU implementation, O(H*W*k^2). For speed:
        - convert labels to ints
        - use scipy.ndimage or numba
        - or apply with rasterio/gdal focal statistics
    """
    if k % 2 == 0 or k < 3:
        raise ValueError("k must be odd and >= 3")
    pad = k // 2
    H, W = arr.shape
    padded = np.pad(arr, ((pad, pad), (pad, pad)), mode="edge")
    out = arr.copy()
    # If numeric, use bincount fast path.
    if np.issubdtype(arr.dtype, np.integer):
        # BUGFIX: np.bincount rejects negative values, so signed labels
        # (e.g. -1 "nodata") crashed the original. Shift by the minimum so
        # counts index from 0; identical results for non-negative labels.
        offset = int(arr.min()) if arr.size else 0
        span = (int(arr.max()) - offset + 1) if arr.size else 1
        for y in range(H):
            for x in range(W):
                win = padded[y : y + k, x : x + k].ravel()
                counts = np.bincount(win - offset, minlength=span)
                out[y, x] = counts.argmax() + offset
        return out
    # String/object/float path.
    for y in range(H):
        for x in range(W):
            win = padded[y : y + k, x : x + k].ravel()
            vals, counts = np.unique(win, return_counts=True)
            out[y, x] = vals[counts.argmax()]
    return out