"""Feature engineering + geospatial helpers for GeoCrop. This module is shared by training (feature selection + scaling helpers) AND inference (DEA STAC fetch + raster alignment + smoothing). Roo Code will likely extend this file significantly. """ from __future__ import annotations import json from dataclasses import dataclass from datetime import date from typing import Dict, Iterable, List, Optional, Tuple import numpy as np import pandas as pd # Raster / geo import rasterio from rasterio.enums import Resampling # ========================================== # Training helpers # ========================================== def drop_junk_columns(df: pd.DataFrame, junk_cols: List[str]) -> pd.DataFrame: cols_to_drop = [c for c in junk_cols if c in df.columns] return df.drop(columns=cols_to_drop) def scout_feature_selection( X_train: pd.DataFrame, y_train: np.ndarray, n_estimators: int = 100, random_state: int = 42, ) -> List[str]: """Scout LightGBM feature selection (keeps non-zero importances).""" import lightgbm as lgb lgbm = lgb.LGBMClassifier(n_estimators=n_estimators, random_state=random_state, verbose=-1) lgbm.fit(X_train, y_train) importances = pd.DataFrame( {"Feature": X_train.columns, "Importance": lgbm.feature_importances_} ).sort_values("Importance", ascending=False) selected = importances[importances["Importance"] > 0]["Feature"].tolist() if not selected: # Fallback: keep everything (better than breaking training) selected = list(X_train.columns) return selected def scale_numeric_features( X_train: pd.DataFrame, X_test: pd.DataFrame, ): """Scale only numeric columns, return (X_train_scaled, X_test_scaled, scaler).""" from sklearn.preprocessing import StandardScaler scaler = StandardScaler() num_cols = X_train.select_dtypes(include=[np.number]).columns X_train_scaled = X_train.copy() X_test_scaled = X_test.copy() X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols]) X_test_scaled[num_cols] = scaler.transform(X_test[num_cols]) return X_train_scaled, X_test_scaled, scaler # ========================================== # Inference helpers # ========================================== # AOI tuple: (lon, lat, radius_m) AOI = Tuple[float, float, float] def validate_aoi_zimbabwe(aoi: AOI, max_radius_m: float = 5000.0): """Basic AOI validation. - Ensures radius <= max_radius_m - Ensures AOI center is within rough Zimbabwe bounds. NOTE: For production, use a real Zimbabwe polygon and check circle intersects. You can load a simplified boundary GeoJSON and use shapely. """ lon, lat, radius_m = aoi if radius_m <= 0 or radius_m > max_radius_m: raise ValueError(f"radius_m must be in (0, {max_radius_m}]") # Rough bbox for Zimbabwe (good cheap pre-check). # Lon: 25.2 to 33.1, Lat: -22.5 to -15.6 if not (25.2 <= lon <= 33.1 and -22.5 <= lat <= -15.6): raise ValueError("AOI must be within Zimbabwe") def clip_raster_to_aoi( src_path: str, aoi: AOI, dst_profile_like: Optional[dict] = None, ) -> Tuple[np.ndarray, dict]: """Clip a raster to AOI circle. Template implementation: reads a window around the circle's bbox. For exact circle mask, add a mask step after reading. """ lon, lat, radius_m = aoi with rasterio.open(src_path) as src: # Approx bbox from radius using rough degrees conversion. # Production: use pyproj geodesic buffer. deg = radius_m / 111_320.0 minx, maxx = lon - deg, lon + deg miny, maxy = lat - deg, lat + deg window = rasterio.windows.from_bounds(minx, miny, maxx, maxy, transform=src.transform) window = window.round_offsets().round_lengths() arr = src.read(1, window=window) profile = src.profile.copy() # Update transform for the window profile.update( { "height": arr.shape[0], "width": arr.shape[1], "transform": rasterio.windows.transform(window, src.transform), } ) # Optional: resample/align to dst_profile_like if dst_profile_like is not None: arr, profile = _resample_to_profile(arr, profile, dst_profile_like) return arr, profile def _resample_to_profile(arr: np.ndarray, src_profile: dict, dst_profile: dict) -> Tuple[np.ndarray, dict]: """Nearest-neighbor resample to match dst grid.""" dst_h = dst_profile["height"] dst_w = dst_profile["width"] dst_arr = np.empty((dst_h, dst_w), dtype=arr.dtype) with rasterio.io.MemoryFile() as mem: with mem.open(**src_profile) as src: src.write(arr, 1) rasterio.warp.reproject( source=rasterio.band(src, 1), destination=dst_arr, src_transform=src_profile["transform"], src_crs=src_profile["crs"], dst_transform=dst_profile["transform"], dst_crs=dst_profile["crs"], resampling=Resampling.nearest, ) prof = dst_profile.copy() prof.update({"count": 1, "dtype": str(dst_arr.dtype)}) return dst_arr, prof def load_dw_baseline_window(cfg, year: int, season: str, aoi: AOI) -> Tuple[np.ndarray, dict]: """Loads the DW baseline seasonal COG from MinIO and clips to AOI. The cfg.storage implementation decides whether to stream or download locally. Expected naming convention: dw_{season}_{year}.tif OR DW_Zim_HighestConf_2015_2016.tif You can implement a mapping in cfg.dw_key_for(year, season). """ local_path = cfg.storage.get_dw_local_path(year=year, season=season) arr, profile = clip_raster_to_aoi(local_path, aoi) # Ensure a single band profile profile.update({"count": 1}) if "dtype" not in profile: profile["dtype"] = str(arr.dtype) return arr, profile # ------------------------- # DEA STAC feature stack # ------------------------- def build_feature_stack_from_dea( cfg, aoi: AOI, start_date: str, end_date: str, target_profile: dict, ) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]: """Query DEA STAC and compute a per-pixel feature cube. Returns: feat_arr: (H, W, C) feat_profile: raster profile aligned to target_profile feat_names: list[str] aux_layers: dict for extra outputs (true_color, ndvi, evi, savi) Implementation strategy (recommended): - Use pystac-client + stackstac or odc-stac to load xarray - Reproject/resample to target grid (10m) - Compute composites (e.g., median or best-pixel) - Compute indices For now this is a stub returning zeros so the pipeline wiring works. """ H = target_profile["height"] W = target_profile["width"] # Placeholder features — Roo Code will replace with real DEA loading. feat_names = ["ndvi_peak", "evi_peak", "savi_peak"] feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32) aux_layers = { "true_color": np.zeros((H, W, 3), dtype=np.uint16), "ndvi_peak": np.zeros((H, W), dtype=np.float32), "evi_peak": np.zeros((H, W), dtype=np.float32), "savi_peak": np.zeros((H, W), dtype=np.float32), } feat_profile = target_profile.copy() feat_profile.update({"count": 1, "dtype": "float32"}) return feat_arr, feat_profile, feat_names, aux_layers # ------------------------- # Neighborhood smoothing # ------------------------- def majority_filter(arr: np.ndarray, k: int = 3) -> np.ndarray: """Majority filter for 2D class label arrays. arr may be dtype string (labels) or integers. For strings, we use a slower path with unique counts. k must be odd (3,5,7). NOTE: This is a simple CPU implementation. For speed: - convert labels to ints - use scipy.ndimage or numba - or apply with rasterio/gdal focal statistics """ if k % 2 == 0 or k < 3: raise ValueError("k must be odd and >= 3") pad = k // 2 H, W = arr.shape padded = np.pad(arr, ((pad, pad), (pad, pad)), mode="edge") out = arr.copy() # If numeric, use bincount fast path if np.issubdtype(arr.dtype, np.integer): maxv = int(arr.max()) if arr.size else 0 for y in range(H): for x in range(W): win = padded[y : y + k, x : x + k].ravel() counts = np.bincount(win, minlength=maxv + 1) out[y, x] = counts.argmax() return out # String/obj path for y in range(H): for x in range(W): win = padded[y : y + k, x : x + k].ravel() vals, counts = np.unique(win, return_counts=True) out[y, x] = vals[counts.argmax()] return out