# (extraction artifact removed: file-size metadata header)
"""Feature engineering + geospatial helpers for GeoCrop.
|
|
|
|
This module is shared by training (feature selection + scaling helpers)
|
|
AND inference (DEA STAC fetch + raster alignment + smoothing).
|
|
|
|
Roo Code will likely extend this file significantly.
|
|
"""
|
|
|
|
from __future__ import annotations

import json
from dataclasses import dataclass
from datetime import date
from typing import Dict, Iterable, List, Optional, Tuple

import numpy as np
import pandas as pd

# Raster / geo
import rasterio
import rasterio.io
import rasterio.warp
import rasterio.windows
from rasterio.enums import Resampling
|
|
|
|
|
|
# ==========================================
# Training helpers
# ==========================================


def drop_junk_columns(df: pd.DataFrame, junk_cols: List[str]) -> pd.DataFrame:
    """Return ``df`` without any of the columns named in ``junk_cols``.

    Names in ``junk_cols`` that are absent from ``df`` are silently ignored,
    so this never raises on a missing column. The input frame is not mutated.
    """
    present = set(df.columns)
    return df.drop(columns=[name for name in junk_cols if name in present])
def scout_feature_selection(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    n_estimators: int = 100,
    random_state: int = 42,
) -> List[str]:
    """Scout LightGBM feature selection (keeps non-zero importances)."""
    import lightgbm as lgb

    model = lgb.LGBMClassifier(
        n_estimators=n_estimators, random_state=random_state, verbose=-1
    )
    model.fit(X_train, y_train)

    ranked = pd.DataFrame(
        {"Feature": X_train.columns, "Importance": model.feature_importances_}
    ).sort_values("Importance", ascending=False)

    keep = ranked.loc[ranked["Importance"] > 0, "Feature"].tolist()
    if keep:
        return keep
    # Fallback: keep everything (better than breaking training)
    return list(X_train.columns)
def scale_numeric_features(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
):
    """Scale only numeric columns, return (X_train_scaled, X_test_scaled, scaler)."""
    from sklearn.preprocessing import StandardScaler

    numeric_cols = X_train.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()

    train_scaled = X_train.copy()
    test_scaled = X_test.copy()

    # Fit on train only; apply the same transform to test (no leakage).
    train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

    return train_scaled, test_scaled, scaler
# ==========================================
# Inference helpers
# ==========================================

# AOI tuple: (lon, lat, radius_m)
# lon/lat are geographic degrees (presumably EPSG:4326 — confirm against data),
# radius_m is the AOI circle radius in metres.
AOI = Tuple[float, float, float]
|
def validate_aoi_zimbabwe(aoi: AOI, max_radius_m: float = 5000.0):
    """Basic AOI validation.

    - Ensures radius <= max_radius_m
    - Ensures AOI center is within rough Zimbabwe bounds.

    NOTE: For production, use a real Zimbabwe polygon and check circle intersects.
    You can load a simplified boundary GeoJSON and use shapely.
    """
    lon, lat, radius_m = aoi

    if radius_m <= 0 or radius_m > max_radius_m:
        raise ValueError(f"radius_m must be in (0, {max_radius_m}]")

    # Rough bbox for Zimbabwe (good cheap pre-check).
    # Lon: 25.2 to 33.1, Lat: -22.5 to -15.6
    lon_ok = 25.2 <= lon <= 33.1
    lat_ok = -22.5 <= lat <= -15.6
    if not (lon_ok and lat_ok):
        raise ValueError("AOI must be within Zimbabwe")
def clip_raster_to_aoi(
    src_path: str,
    aoi: AOI,
    dst_profile_like: Optional[dict] = None,
) -> Tuple[np.ndarray, dict]:
    """Clip a raster to the AOI circle's bounding box.

    Template implementation: reads a window around the circle's bbox.
    For exact circle mask, add a mask step after reading.

    Parameters
    ----------
    src_path : path to a rasterio-readable raster whose transform is in
        degrees (assumed geographic CRS — TODO confirm against data).
    aoi : (lon, lat, radius_m) tuple.
    dst_profile_like : optional rasterio profile dict; when given, the clipped
        array is resampled onto that grid via ``_resample_to_profile``.

    Returns
    -------
    (array, profile) for the clipped window (band 1 only).
    """
    lon, lat, radius_m = aoi

    with rasterio.open(src_path) as src:
        # Approx bbox from radius using rough degrees conversion.
        # Production: use pyproj geodesic buffer.
        deg = radius_m / 111_320.0
        minx, maxx = lon - deg, lon + deg
        miny, maxy = lat - deg, lat + deg

        window = rasterio.windows.from_bounds(
            minx, miny, maxx, maxy, transform=src.transform
        )
        window = window.round_offsets().round_lengths()
        # FIX: clamp to the dataset extent — an AOI near the raster edge
        # otherwise requests out-of-bounds pixels.
        window = window.intersection(
            rasterio.windows.Window(0, 0, src.width, src.height)
        )

        arr = src.read(1, window=window)
        profile = src.profile.copy()

        # Update shape/transform so the profile describes the windowed array.
        profile.update(
            {
                "height": arr.shape[0],
                "width": arr.shape[1],
                "transform": rasterio.windows.transform(window, src.transform),
            }
        )

    # Optional: resample/align to dst_profile_like
    if dst_profile_like is not None:
        arr, profile = _resample_to_profile(arr, profile, dst_profile_like)

    return arr, profile
|
def _resample_to_profile(arr: np.ndarray, src_profile: dict, dst_profile: dict) -> Tuple[np.ndarray, dict]:
    """Nearest-neighbor resample ``arr`` onto the grid in ``dst_profile``.

    Both profiles are standard rasterio profile dicts and must carry
    "height", "width", "transform" and "crs".

    Returns (resampled_array, profile) — the profile is a copy of
    ``dst_profile`` updated to a single band of the array's dtype.
    """
    # FIX: bare `import rasterio` is not guaranteed to expose the `warp` /
    # `io` submodules, so `rasterio.warp.reproject(...)` could raise
    # AttributeError at runtime. Import them explicitly.
    from rasterio.io import MemoryFile
    from rasterio.warp import reproject

    dst_h = dst_profile["height"]
    dst_w = dst_profile["width"]

    # FIX: zeros, not empty — pixels the source grid does not cover would
    # otherwise contain nondeterministic garbage.
    dst_arr = np.zeros((dst_h, dst_w), dtype=arr.dtype)

    with MemoryFile() as mem:
        with mem.open(**src_profile) as src:
            src.write(arr, 1)
            reproject(
                source=rasterio.band(src, 1),
                destination=dst_arr,
                src_transform=src_profile["transform"],
                src_crs=src_profile["crs"],
                dst_transform=dst_profile["transform"],
                dst_crs=dst_profile["crs"],
                resampling=Resampling.nearest,
            )

    prof = dst_profile.copy()
    prof.update({"count": 1, "dtype": str(dst_arr.dtype)})
    return dst_arr, prof
|
def load_dw_baseline_window(cfg, year: int, season: str, aoi: AOI) -> Tuple[np.ndarray, dict]:
    """Loads the DW baseline seasonal COG from MinIO and clips to AOI.

    The cfg.storage implementation decides whether to stream or download locally.

    Expected naming convention:
        dw_{season}_{year}.tif OR DW_Zim_HighestConf_2015_2016.tif

    You can implement a mapping in cfg.dw_key_for(year, season).
    """
    path = cfg.storage.get_dw_local_path(year=year, season=season)
    arr, profile = clip_raster_to_aoi(path, aoi)

    # Force a single-band profile; backfill dtype only when the source
    # profile omitted it.
    profile["count"] = 1
    profile.setdefault("dtype", str(arr.dtype))

    return arr, profile
|
# -------------------------
# DEA STAC feature stack
# -------------------------


def build_feature_stack_from_dea(
    cfg,
    aoi: AOI,
    start_date: str,
    end_date: str,
    target_profile: dict,
) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
    """Query DEA STAC and compute a per-pixel feature cube.

    Returns:
        feat_arr: (H, W, C)
        feat_profile: raster profile aligned to target_profile
        feat_names: list[str]
        aux_layers: dict for extra outputs (true_color, ndvi, evi, savi)

    Implementation strategy (recommended):
        - Use pystac-client + stackstac or odc-stac to load xarray
        - Reproject/resample to target grid (10m)
        - Compute composites (e.g., median or best-pixel)
        - Compute indices

    For now this is a stub returning zeros so the pipeline wiring works.
    """
    height = target_profile["height"]
    width = target_profile["width"]

    # Placeholder features — Roo Code will replace with real DEA loading.
    feat_names = ["ndvi_peak", "evi_peak", "savi_peak"]
    feat_arr = np.zeros((height, width, len(feat_names)), dtype=np.float32)

    aux_layers: Dict[str, np.ndarray] = {
        "true_color": np.zeros((height, width, 3), dtype=np.uint16),
    }
    for name in feat_names:
        aux_layers[name] = np.zeros((height, width), dtype=np.float32)

    feat_profile = dict(target_profile)
    feat_profile.update({"count": 1, "dtype": "float32"})

    return feat_arr, feat_profile, feat_names, aux_layers
|
# -------------------------
# Neighborhood smoothing
# -------------------------


def majority_filter(arr: np.ndarray, k: int = 3) -> np.ndarray:
    """Majority filter for 2D class label arrays.

    arr may be dtype string/object (labels) or integers. Non-negative integer
    labels take a fast np.bincount path; everything else — including negative
    integers, which np.bincount rejects — takes the slower np.unique path.
    (The original routed ALL integers through bincount, so negative labels
    crashed with ValueError.)

    Ties resolve to the smallest value in both paths (bincount indexes and
    np.unique output are both in ascending order).

    k must be odd (3, 5, 7, ...).

    Raises
    ------
    ValueError : if k is even or < 3, or if arr is not 2D.

    NOTE: This is a simple CPU implementation. For speed:
      - convert labels to ints
      - use scipy.ndimage or numba
      - or apply with rasterio/gdal focal statistics
    """
    if k % 2 == 0 or k < 3:
        raise ValueError("k must be odd and >= 3")
    if arr.ndim != 2:
        raise ValueError("arr must be 2D")

    pad = k // 2
    H, W = arr.shape
    # Edge padding keeps the output the same shape as the input.
    padded = np.pad(arr, ((pad, pad), (pad, pad)), mode="edge")

    out = arr.copy()

    # Fast path: non-negative integers -> bincount per window.
    if np.issubdtype(arr.dtype, np.integer) and (arr.size == 0 or arr.min() >= 0):
        maxv = int(arr.max()) if arr.size else 0
        for y in range(H):
            for x in range(W):
                win = padded[y : y + k, x : x + k].ravel()
                counts = np.bincount(win, minlength=maxv + 1)
                out[y, x] = counts.argmax()
        return out

    # Generic path (strings/objects/negative ints): unique + counts.
    for y in range(H):
        for x in range(W):
            win = padded[y : y + k, x : x + k].ravel()
            vals, counts = np.unique(win, return_counts=True)
            out[y, x] = vals[counts.argmax()]

    return out
|