# geocrop-platform/training/features.py
# (281 lines, 8.7 KiB, Python)
"""Feature engineering + geospatial helpers for GeoCrop.
This module is shared by training (feature selection + scaling helpers)
AND inference (DEA STAC fetch + raster alignment + smoothing).
Roo Code will likely extend this file significantly.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from datetime import date
from typing import Dict, Iterable, List, Optional, Tuple
import numpy as np
import pandas as pd
# Raster / geo
import rasterio
from rasterio.enums import Resampling
# ==========================================
# Training helpers
# ==========================================
def drop_junk_columns(df: pd.DataFrame, junk_cols: List[str]) -> pd.DataFrame:
    """Return a copy of *df* without any of *junk_cols* that actually exist.

    Names in *junk_cols* that are absent from *df* are silently ignored, so
    callers can reuse one shared junk list across differently-shaped frames.
    """
    present = set(df.columns)
    drop_these = [name for name in junk_cols if name in present]
    return df.drop(columns=drop_these)
def scout_feature_selection(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    n_estimators: int = 100,
    random_state: int = 42,
) -> List[str]:
    """Scout LightGBM feature selection (keeps non-zero importances).

    Fits a throwaway LGBMClassifier and returns the feature names whose
    importance is strictly positive, ordered from most to least important.
    Falls back to every column if the model zeroes everything out, so the
    training pipeline never receives an empty feature list.
    """
    import lightgbm as lgb

    model = lgb.LGBMClassifier(n_estimators=n_estimators, random_state=random_state, verbose=-1)
    model.fit(X_train, y_train)
    ranked = (
        pd.DataFrame({"Feature": X_train.columns, "Importance": model.feature_importances_})
        .sort_values("Importance", ascending=False)
    )
    keep = list(ranked.loc[ranked["Importance"] > 0, "Feature"])
    if not keep:
        # Fallback: keep everything (better than breaking training).
        keep = list(X_train.columns)
    return keep
def scale_numeric_features(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
):
    """Scale only numeric columns, return (X_train_scaled, X_test_scaled, scaler).

    The scaler is fit on the training frame alone and merely applied to the
    test frame, avoiding train/test leakage. Non-numeric columns pass through
    untouched; the inputs themselves are not mutated.
    """
    from sklearn.preprocessing import StandardScaler

    numeric_cols = X_train.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    train_out = X_train.copy()
    test_out = X_test.copy()
    train_out[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    test_out[numeric_cols] = scaler.transform(X_test[numeric_cols])
    return train_out, test_out, scaler
# ==========================================
# Inference helpers
# ==========================================
# AOI tuple: (lon, lat, radius_m)
AOI = Tuple[float, float, float]


def validate_aoi_zimbabwe(aoi: AOI, max_radius_m: float = 5000.0):
    """Cheap sanity checks on an AOI before any expensive raster work.

    Rejects a non-positive or oversized radius, and a center that falls
    outside a rough Zimbabwe bounding box.

    Raises:
        ValueError: when the radius or the center is out of range.

    NOTE: For production, use a real Zimbabwe polygon and check the circle
    intersects it (e.g. a simplified boundary GeoJSON + shapely).
    """
    lon, lat, radius_m = aoi
    if radius_m <= 0 or radius_m > max_radius_m:
        raise ValueError(f"radius_m must be in (0, {max_radius_m}]")
    # Rough bbox for Zimbabwe (good cheap pre-check):
    # Lon 25.2..33.1, Lat -22.5..-15.6
    inside_lon = 25.2 <= lon <= 33.1
    inside_lat = -22.5 <= lat <= -15.6
    if not (inside_lon and inside_lat):
        raise ValueError("AOI must be within Zimbabwe")
def clip_raster_to_aoi(
    src_path: str,
    aoi: AOI,
    dst_profile_like: Optional[dict] = None,
) -> Tuple[np.ndarray, dict]:
    """Clip a raster to the bounding box of an AOI circle.

    Reads a window around the circle's bbox (for an exact circular clip,
    apply a mask after reading).

    Args:
        src_path: path to any rasterio-readable raster (e.g. GeoTIFF/COG).
        aoi: (lon, lat, radius_m); only the bbox of the circle is used here.
        dst_profile_like: optional target profile; when given, the clipped
            array is resampled onto that grid via _resample_to_profile.

    Returns:
        (arr, profile) for band 1 of the clipped window, with the profile's
        height/width/transform updated to match.

    Raises:
        rasterio.errors.WindowError: if the AOI bbox does not overlap the
            raster at all (loud failure beats silent misalignment).
    """
    # Explicit submodule import: a bare `import rasterio` does not
    # guarantee `rasterio.windows` is bound as an attribute.
    from rasterio import windows as rio_windows

    lon, lat, radius_m = aoi
    with rasterio.open(src_path) as src:
        # Approx bbox from radius using rough degrees conversion.
        # Production: use pyproj geodesic buffer.
        deg = radius_m / 111_320.0
        minx, maxx = lon - deg, lon + deg
        miny, maxy = lat - deg, lat + deg
        window = rio_windows.from_bounds(minx, miny, maxx, maxy, transform=src.transform)
        window = window.round_offsets().round_lengths()
        # BUGFIX: clamp the window to the dataset extent. A window with
        # negative offsets is truncated by read(), but the transform below
        # would still be computed from the unclipped window, silently
        # misaligning the output raster.
        window = rio_windows.intersection(
            window, rio_windows.Window(0, 0, src.width, src.height)
        )
        arr = src.read(1, window=window)
        profile = src.profile.copy()
        # Update shape + transform so the profile describes the window.
        profile.update(
            {
                "height": arr.shape[0],
                "width": arr.shape[1],
                "transform": rio_windows.transform(window, src.transform),
            }
        )
    # Optional: resample/align to dst_profile_like.
    if dst_profile_like is not None:
        arr, profile = _resample_to_profile(arr, profile, dst_profile_like)
    return arr, profile
def _resample_to_profile(arr: np.ndarray, src_profile: dict, dst_profile: dict) -> Tuple[np.ndarray, dict]:
    """Nearest-neighbor resample of a single band onto the dst grid.

    Args:
        arr: 2D source array.
        src_profile: profile describing arr (needs "transform" and "crs").
        dst_profile: target grid (needs "height", "width", "transform", "crs").

    Returns:
        (dst_arr, prof): resampled array and a copy of dst_profile updated
        to a single band with dst_arr's dtype.
    """
    # BUGFIX: `rasterio.warp` and `rasterio.io` are NOT bound by a bare
    # `import rasterio` — the original attribute access raised
    # AttributeError at runtime. Import them explicitly.
    from rasterio.io import MemoryFile
    from rasterio.warp import reproject

    dst_h = dst_profile["height"]
    dst_w = dst_profile["width"]
    dst_arr = np.empty((dst_h, dst_w), dtype=arr.dtype)
    # Force a single band and arr's dtype for the in-memory dataset:
    # src_profile may come from a multiband raster, but only band 1 is
    # written below.
    mem_profile = src_profile.copy()
    mem_profile.update({"count": 1, "dtype": str(arr.dtype)})
    # Round-trip through an in-memory dataset so reproject can read a band.
    with MemoryFile() as mem:
        with mem.open(**mem_profile) as src:
            src.write(arr, 1)
            reproject(
                source=rasterio.band(src, 1),
                destination=dst_arr,
                src_transform=src_profile["transform"],
                src_crs=src_profile["crs"],
                dst_transform=dst_profile["transform"],
                dst_crs=dst_profile["crs"],
                resampling=Resampling.nearest,
            )
    prof = dst_profile.copy()
    prof.update({"count": 1, "dtype": str(dst_arr.dtype)})
    return dst_arr, prof
def load_dw_baseline_window(cfg, year: int, season: str, aoi: AOI) -> Tuple[np.ndarray, dict]:
    """Load the DW baseline seasonal COG from MinIO and clip it to the AOI.

    The cfg.storage implementation decides whether to stream or download
    locally. Expected naming convention:
        dw_{season}_{year}.tif OR DW_Zim_HighestConf_2015_2016.tif
    A mapping can be implemented in cfg.dw_key_for(year, season).
    """
    local_path = cfg.storage.get_dw_local_path(year=year, season=season)
    window_arr, window_profile = clip_raster_to_aoi(local_path, aoi)
    # Normalize to a single-band profile with an explicit dtype.
    window_profile.update({"count": 1})
    window_profile.setdefault("dtype", str(window_arr.dtype))
    return window_arr, window_profile
# -------------------------
# DEA STAC feature stack
# -------------------------
def build_feature_stack_from_dea(
    cfg,
    aoi: AOI,
    start_date: str,
    end_date: str,
    target_profile: dict,
) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
    """Query DEA STAC and compute a per-pixel feature cube.

    Returns:
        feat_arr: (H, W, C) float32 feature cube
        feat_profile: raster profile aligned to target_profile (1 band)
        feat_names: list[str], one name per channel of feat_arr
        aux_layers: dict of extra outputs (true_color, ndvi, evi, savi)

    Implementation strategy (recommended):
        - Use pystac-client + stackstac or odc-stac to load xarray
        - Reproject/resample to target grid (10m)
        - Compute composites (e.g., median or best-pixel)
        - Compute indices

    For now this is a stub returning zeros so the pipeline wiring works.
    """
    height = target_profile["height"]
    width = target_profile["width"]
    # Placeholder features — real DEA loading replaces these zeros later.
    feat_names = ["ndvi_peak", "evi_peak", "savi_peak"]
    feat_arr = np.zeros((height, width, len(feat_names)), dtype=np.float32)
    aux_layers: Dict[str, np.ndarray] = {
        "true_color": np.zeros((height, width, 3), dtype=np.uint16),
    }
    for channel in feat_names:
        aux_layers[channel] = np.zeros((height, width), dtype=np.float32)
    feat_profile = target_profile.copy()
    feat_profile.update({"count": 1, "dtype": "float32"})
    return feat_arr, feat_profile, feat_names, aux_layers
# -------------------------
# Neighborhood smoothing
# -------------------------
def majority_filter(arr: np.ndarray, k: int = 3) -> np.ndarray:
    """Majority (modal) filter for 2D class label arrays.

    Each output pixel becomes the most frequent value in the k x k window
    centered on it; edges use edge-replicated padding. Ties resolve to the
    smallest value (integer path) or the first value in sorted order
    (generic path).

    Args:
        arr: 2D label array. Integer dtypes take a fast bincount path;
            strings/objects/floats take a slower np.unique path.
        k: window size, odd and >= 3 (3, 5, 7, ...).

    Raises:
        ValueError: if k is even or < 3.

    NOTE: This is a simple CPU implementation, O(H*W*k^2). For speed:
        - convert labels to ints
        - use scipy.ndimage or numba
        - or apply with rasterio/gdal focal statistics
    """
    if k % 2 == 0 or k < 3:
        raise ValueError("k must be odd and >= 3")
    pad = k // 2
    H, W = arr.shape
    padded = np.pad(arr, ((pad, pad), (pad, pad)), mode="edge")
    out = arr.copy()
    # If numeric, use bincount fast path.
    if np.issubdtype(arr.dtype, np.integer):
        # BUGFIX: np.bincount rejects negative values, so signed labels
        # (e.g. -1 "nodata") crashed the original. Shift by the minimum so
        # counts index from 0; identical results for non-negative labels.
        offset = int(arr.min()) if arr.size else 0
        span = (int(arr.max()) - offset + 1) if arr.size else 1
        for y in range(H):
            for x in range(W):
                win = padded[y : y + k, x : x + k].ravel()
                counts = np.bincount(win - offset, minlength=span)
                out[y, x] = counts.argmax() + offset
        return out
    # String/object/float path.
    for y in range(H):
        for x in range(W):
            win = padded[y : y + k, x : x + k].ravel()
            vals, counts = np.unique(win, return_counts=True)
            out[y, x] = vals[counts.argmax()]
    return out