"""Feature engineering + geospatial helpers for GeoCrop. This module is shared by training (feature selection + scaling helpers) AND inference (DEA STAC fetch + raster alignment + smoothing). IMPORTANT: This implementation exactly replicates train.py feature engineering: - Savitzky-Golay smoothing (window=5, polyorder=2) with 0-interpolation - Phenology metrics (amplitude, AUC, peak_timestep, max_slope) - Harmonic/Fourier features (1st and 2nd order sin/cos) - Seasonal window statistics (Early: Oct-Dec, Peak: Jan-Mar, Late: Apr-Jun) """ from __future__ import annotations import json import re from dataclasses import dataclass from datetime import date from typing import Dict, Iterable, List, Optional, Tuple import numpy as np import pandas as pd # Raster / geo import rasterio from rasterio.enums import Resampling # ========================================== # Training helpers # ========================================== def drop_junk_columns(df: pd.DataFrame, junk_cols: List[str]) -> pd.DataFrame: """Drop junk/spatial columns that would cause data leakage. 
Matches train.py junk_cols: ['.geo', 'system:index', 'latitude', 'longitude', 'lat', 'lon', 'ID', 'parent_id', 'batch_id', 'is_syn'] """ cols_to_drop = [c for c in junk_cols if c in df.columns] return df.drop(columns=cols_to_drop) def scout_feature_selection( X_train: pd.DataFrame, y_train: np.ndarray, n_estimators: int = 100, random_state: int = 42, ) -> List[str]: """Scout LightGBM feature selection (keeps non-zero importances).""" import lightgbm as lgb lgbm = lgb.LGBMClassifier(n_estimators=n_estimators, random_state=random_state, verbose=-1) lgbm.fit(X_train, y_train) importances = pd.DataFrame( {"Feature": X_train.columns, "Importance": lgbm.feature_importances_} ).sort_values("Importance", ascending=False) selected = importances[importances["Importance"] > 0]["Feature"].tolist() if not selected: # Fallback: keep everything (better than breaking training) selected = list(X_train.columns) return selected def scale_numeric_features( X_train: pd.DataFrame, X_test: pd.DataFrame, ): """Scale only numeric columns, return (X_train_scaled, X_test_scaled, scaler). Uses StandardScaler (matches train.py). """ from sklearn.preprocessing import StandardScaler scaler = StandardScaler() num_cols = X_train.select_dtypes(include=[np.number]).columns X_train_scaled = X_train.copy() X_test_scaled = X_test.copy() X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols]) X_test_scaled[num_cols] = scaler.transform(X_test[num_cols]) return X_train_scaled, X_test_scaled, scaler # ========================================== # INFERENCE-ONLY FEATURE ENGINEERING # These functions replicate train.py for raster-based inference # ========================================== def apply_smoothing_to_rasters( timeseries_dict: Dict[str, np.ndarray], dates: List[str] ) -> Dict[str, np.ndarray]: """Apply Savitzky-Golay smoothing to time-series raster arrays. Replicates train.py apply_smoothing(): 1. Replace 0 with NaN 2. Linear interpolate across time axis, fillna(0) 3. 
def apply_smoothing_to_rasters(
    timeseries_dict: Dict[str, np.ndarray], dates: List[str]
) -> Dict[str, np.ndarray]:
    """Apply Savitzky-Golay smoothing to time-series raster arrays.

    Replicates train.py apply_smoothing():
    1. Replace 0 with NaN (0 is treated as "no observation")
    2. Linear interpolate across the time axis, fillna(0)
    3. Savitzky-Golay: window_length=5, polyorder=2

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array
        dates: List of date strings in YYYYMMDD format (defines T)

    Returns:
        Dict mapping index name to smoothed (H, W, T) array
    """
    from scipy.signal import savgol_filter

    smoothed = {}
    for idx_name, arr in timeseries_dict.items():
        H, W, T = arr.shape
        # Flatten pixels to (H*W, T) for vectorized processing.
        arr_2d = arr.reshape(-1, T)

        # 1. Replace 0 with NaN so gaps are interpolated, not smoothed over.
        arr_2d = np.where(arr_2d == 0, np.nan, arr_2d)

        # 2. Linear interpolation along time for all pixels at once.
        # One DataFrame.interpolate call replaces the per-pixel Python
        # loop (identical result to row-wise pd.Series.interpolate).
        interp_arr = (
            pd.DataFrame(arr_2d)
            .interpolate(method='linear', axis=1, limit_direction='both')
            .fillna(0)
            .to_numpy()
        )

        # 3. Savitzky-Golay smoothing along the time axis.
        # NOTE: requires T >= 5 (window_length); matches train.py.
        smooth_arr = savgol_filter(interp_arr, window_length=5, polyorder=2, axis=1)

        smoothed[idx_name] = smooth_arr.reshape(H, W, T)
    return smoothed
def extract_phenology_from_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi', 'ndre', 'evi']
) -> Dict[str, np.ndarray]:
    """Extract phenology metrics from time-series raster arrays.

    Replicates train.py extract_phenology():
    - Magnitude: max, min, mean, std, amplitude
    - AUC: trapezoid integral with dx=10
    - Timing: peak_timestep (argmax)
    - Slopes: max_slope_up, max_slope_down

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array (should be smoothed)
        dates: List of date strings
        indices: Which indices to process

    Returns:
        Dict mapping feature name to (H, W) array
    """
    from scipy.integrate import trapezoid

    features: Dict[str, np.ndarray] = {}
    for idx in indices:
        if idx not in timeseries_dict:
            continue
        cube = timeseries_dict[idx]  # (H, W, T)

        # Magnitude metrics, reduced along the time axis.
        hi = cube.max(axis=-1)
        lo = cube.min(axis=-1)
        features[f'{idx}_max'] = hi
        features[f'{idx}_min'] = lo
        features[f'{idx}_mean'] = cube.mean(axis=-1)
        features[f'{idx}_std'] = cube.std(axis=-1)
        features[f'{idx}_amplitude'] = hi - lo

        # Area under the seasonal curve (10-day sampling interval).
        features[f'{idx}_auc'] = trapezoid(cube, dx=10, axis=-1)

        # Timing: which timestep holds the seasonal peak.
        features[f'{idx}_peak_timestep'] = cube.argmax(axis=-1)

        # Fastest green-up / senescence rates between adjacent steps.
        deltas = np.diff(cube, axis=-1)  # (H, W, T-1)
        features[f'{idx}_max_slope_up'] = deltas.max(axis=-1)
        features[f'{idx}_max_slope_down'] = deltas.min(axis=-1)
    return features
def add_harmonics_to_rasters(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi']
) -> Dict[str, np.ndarray]:
    """Add harmonic/fourier features from time-series raster arrays.

    Replicates train.py add_harmonics():
    - 1st order: sin(t), cos(t)
    - 2nd order: sin(2t), cos(2t)
    where t = 2*pi * time_step / n_times

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array (should be smoothed)
        dates: List of date strings
        indices: Which indices to process

    Returns:
        Dict mapping feature name to (H, W) array
    """
    n_times = len(dates)

    # One full cycle (0..2*pi) across the season's timesteps.
    t = 2 * np.pi * np.arange(n_times) / n_times

    # Harmonic basis vectors, keyed by the feature-name suffix.
    basis = {
        'harmonic1_sin': np.sin(t),
        'harmonic1_cos': np.cos(t),
        'harmonic2_sin': np.sin(2 * t),
        'harmonic2_cos': np.cos(2 * t),
    }

    features: Dict[str, np.ndarray] = {}
    for idx in indices:
        if idx not in timeseries_dict:
            continue
        cube = timeseries_dict[idx]  # (H, W, T)
        H, W, T = cube.shape
        flat = cube.reshape(-1, T)

        # Projection of each pixel's series onto each basis vector,
        # normalized by the number of timesteps.
        for suffix, vec in basis.items():
            features[f'{idx}_{suffix}'] = (flat @ vec / n_times).reshape(H, W)
    return features
def add_seasonal_windows_and_interactions(
    timeseries_dict: Dict[str, np.ndarray],
    dates: List[str],
    indices: List[str] = ['ndvi', 'ndwi', 'ndre'],
    phenology_features: Dict[str, np.ndarray] = None
) -> Dict[str, np.ndarray]:
    """Add seasonal window statistics and index interactions.

    Replicates train.py add_interactions_and_windows():
    - Seasonal windows (Zimbabwe season: Oct-Jun):
        - Early: Oct-Dec (months 10, 11, 12)
        - Peak:  Jan-Mar (months 1, 2, 3)
        - Late:  Apr-Jun (months 4, 5, 6)
    - Interactions:
        - ndvi_ndre_peak_diff = ndvi_max - ndre_max
        - canopy_density_contrast = evi_mean / (ndvi_mean + 0.001)

    Args:
        timeseries_dict: Dict mapping index name to (H, W, T) array
        dates: List of date strings in YYYYMMDD format
        indices: Which indices to process
        phenology_features: Dict of phenology features for interactions

    Returns:
        Dict mapping feature name to (H, W) array
    """
    features: Dict[str, np.ndarray] = {}

    # Month of every timestep, parsed once up front.
    month_of_step = pd.to_datetime(dates, format='%Y%m%d').month

    # Seasonal windows defined by calendar month.
    windows = {
        'early': [10, 11, 12],  # Oct-Dec
        'peak': [1, 2, 3],      # Jan-Mar
        'late': [4, 5, 6],      # Apr-Jun
    }

    for idx in indices:
        if idx not in timeseries_dict:
            continue
        cube = timeseries_dict[idx]  # (H, W, T)
        for win_name, months in windows.items():
            in_window = np.isin(month_of_step, months)
            if not in_window.any():
                continue  # no observations fall inside this window
            season_slice = cube[:, :, in_window]  # (H, W, T_window)
            features[f'{idx}_{win_name}_mean'] = season_slice.mean(axis=-1)
            features[f'{idx}_{win_name}_max'] = season_slice.max(axis=-1)

    # Cross-index interactions (require phenology features).
    if phenology_features is not None:
        ph = phenology_features
        if 'ndvi_max' in ph and 'ndre_max' in ph:
            features['ndvi_ndre_peak_diff'] = ph['ndvi_max'] - ph['ndre_max']
        if 'evi_mean' in ph and 'ndvi_mean' in ph:
            # Small epsilon keeps the ratio finite where ndvi_mean == 0.
            features['canopy_density_contrast'] = (
                ph['evi_mean'] / (ph['ndvi_mean'] + 0.001)
            )
    return features


# ==========================================
# Inference helpers
# ==========================================

# AOI tuple: (lon, lat, radius_m)
AOI = Tuple[float, float, float]


def validate_aoi_zimbabwe(aoi: AOI, max_radius_m: float = 5000.0):
    """Basic AOI validation.

    - Ensures radius <= max_radius_m
    - Ensures AOI center is within rough Zimbabwe bounds.

    NOTE: For production, use a real Zimbabwe polygon and check circle
    intersects. You can load a simplified boundary GeoJSON and use shapely.

    Raises:
        ValueError: if the radius is non-positive / too large, or the
            center falls outside the rough Zimbabwe bounding box.
    """
    lon, lat, radius_m = aoi
    if radius_m <= 0 or radius_m > max_radius_m:
        raise ValueError(f"radius_m must be in (0, {max_radius_m}]")
    # Rough bbox for Zimbabwe (good cheap pre-check).
    # Lon: 25.2 to 33.1, Lat: -22.5 to -15.6
    if not (25.2 <= lon <= 33.1 and -22.5 <= lat <= -15.6):
        raise ValueError("AOI must be within Zimbabwe")


def clip_raster_to_aoi(
    src_path: str,
    aoi: AOI,
    dst_profile_like: Optional[dict] = None,
) -> Tuple[np.ndarray, dict]:
    """Clip a raster to AOI circle.

    Template implementation: reads a window around the circle's bbox.
    For exact circle mask, add a mask step after reading.

    Returns:
        (array, profile) for the clipped window, optionally resampled
        onto the grid described by ``dst_profile_like``.
    """
    # Explicit submodule import: `import rasterio` alone does not
    # guarantee the submodule attribute is attached.
    from rasterio import windows as rio_windows

    lon, lat, radius_m = aoi
    with rasterio.open(src_path) as src:
        # Approx bbox from radius using rough degrees conversion.
        # Production: use pyproj geodesic buffer.
        deg = radius_m / 111_320.0
        minx, maxx = lon - deg, lon + deg
        miny, maxy = lat - deg, lat + deg
        window = rio_windows.from_bounds(
            minx, miny, maxx, maxy, transform=src.transform
        )
        window = window.round_offsets().round_lengths()
        arr = src.read(1, window=window)
        profile = src.profile.copy()
        # Update transform/shape for the clipped window.
        profile.update(
            {
                "height": arr.shape[0],
                "width": arr.shape[1],
                "transform": rio_windows.transform(window, src.transform),
            }
        )
    # Optional: resample/align to dst_profile_like
    if dst_profile_like is not None:
        arr, profile = _resample_to_profile(arr, profile, dst_profile_like)
    return arr, profile


def _resample_to_profile(arr: np.ndarray, src_profile: dict, dst_profile: dict) -> Tuple[np.ndarray, dict]:
    """Nearest-neighbor resample `arr` onto the grid described by dst_profile."""
    # BUG FIX: the original called `rasterio.warp.reproject`, but
    # `import rasterio` does not attach the `warp` submodule — this
    # raised AttributeError at runtime. Import it explicitly.
    from rasterio.io import MemoryFile
    from rasterio.warp import reproject

    dst_h = dst_profile["height"]
    dst_w = dst_profile["width"]
    dst_arr = np.empty((dst_h, dst_w), dtype=arr.dtype)
    with MemoryFile() as mem:
        with mem.open(**src_profile) as src:
            src.write(arr, 1)
            reproject(
                source=rasterio.band(src, 1),
                destination=dst_arr,
                src_transform=src_profile["transform"],
                src_crs=src_profile["crs"],
                dst_transform=dst_profile["transform"],
                dst_crs=dst_profile["crs"],
                resampling=Resampling.nearest,
            )
    prof = dst_profile.copy()
    prof.update({"count": 1, "dtype": str(dst_arr.dtype)})
    return dst_arr, prof
""" local_path = cfg.storage.get_dw_local_path(year=year, season=season) arr, profile = clip_raster_to_aoi(local_path, aoi) # Ensure a single band profile profile.update({"count": 1}) if "dtype" not in profile: profile["dtype"] = str(arr.dtype) return arr, profile # ------------------------- # DEA STAC feature stack # ------------------------- def compute_indices_from_bands( red: np.ndarray, nir: np.ndarray, blue: np.ndarray = None, green: np.ndarray = None, swir1: np.ndarray = None, swir2: np.ndarray = None ) -> Dict[str, np.ndarray]: """Compute vegetation indices from band arrays. Indices computed: - NDVI = (NIR - Red) / (NIR + Red) - EVI = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1) - SAVI = ((NIR - Red) / (NIR + Red + L)) * (1 + L) where L=0.5 - NDRE = (NIR - RedEdge) / (NIR + RedEdge) - CI_RE = (NIR / RedEdge) - 1 - NDWI = (Green - NIR) / (Green + NIR) Args: red: Red band (B4) nir: NIR band (B8) blue: Blue band (B2, optional) green: Green band (B3, optional) swir1: SWIR1 band (B11, optional) swir2: SWIR2 band (B12, optional) Returns: Dict mapping index name to array """ indices = {} # Ensure float64 for precision nir = nir.astype(np.float64) red = red.astype(np.float64) # NDVI = (NIR - Red) / (NIR + Red) denominator = nir + red indices['ndvi'] = np.where(denominator != 0, (nir - red) / denominator, 0) # EVI = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1) if blue is not None: blue = blue.astype(np.float64) evi_denom = nir + 6*red - 7.5*blue + 1 indices['evi'] = np.where(evi_denom != 0, 2.5 * (nir - red) / evi_denom, 0) # SAVI = ((NIR - Red) / (NIR + Red + L)) * (1 + L) where L=0.5 L = 0.5 savi_denom = nir + red + L indices['savi'] = np.where(savi_denom != 0, ((nir - red) / savi_denom) * (1 + L), 0) # NDRE = (NIR - RedEdge) / (NIR + RedEdge) # RedEdge is typically B5 (705nm) - use NIR if not available if 'rededge' in locals() and rededge is not None: rededge = rededge.astype(np.float64) ndre_denom = nir + rededge indices['ndre'] = np.where(ndre_denom != 
0, (nir - rededge) / ndre_denom, 0) # CI_RE = (NIR / RedEdge) - 1 indices['ci_re'] = np.where(rededge != 0, (nir / rededge) - 1, 0) else: # Fallback: use SWIR1 as proxy for red-edge if available if swir1 is not None: swir1 = swir1.astype(np.float64) ndre_denom = nir + swir1 indices['ndre'] = np.where(ndre_denom != 0, (nir - swir1) / ndre_denom, 0) indices['ci_re'] = np.where(swir1 != 0, (nir / swir1) - 1, 0) # NDWI = (Green - NIR) / (Green + NIR) if green is not None: green = green.astype(np.float64) ndwi_denom = green + nir indices['ndwi'] = np.where(ndwi_denom != 0, (green - nir) / ndwi_denom, 0) return indices def build_feature_stack_from_dea( cfg, aoi: AOI, start_date: str, end_date: str, target_profile: dict, ) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]: """Query DEA STAC and compute a per-pixel feature cube. This function implements the FULL feature engineering pipeline matching train.py: 1. Load Sentinel-2 data from DEA STAC 2. Compute indices (ndvi, ndre, evi, savi, ci_re, ndwi) 3. Apply Savitzky-Golay smoothing with 0-interpolation 4. Extract phenology metrics (amplitude, AUC, peak, slope) 5. Add harmonic/fourier features 6. Add seasonal window statistics 7. 
def build_feature_stack_from_dea(
    cfg,
    aoi: AOI,
    start_date: str,
    end_date: str,
    target_profile: dict,
) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]:
    """Query DEA STAC and compute a per-pixel feature cube.

    This function implements the FULL feature engineering pipeline matching train.py:
    1. Load Sentinel-2 data from DEA STAC
    2. Compute indices (ndvi, ndre, evi, savi, ci_re, ndwi)
    3. Apply Savitzky-Golay smoothing with 0-interpolation
    4. Extract phenology metrics (amplitude, AUC, peak, slope)
    5. Add harmonic/fourier features
    6. Add seasonal window statistics
    7. Add index interactions

    Args:
        cfg: App config; may provide `dea_stac_url`.
        aoi: (lon, lat, radius_m) tuple.
        start_date, end_date: date strings bounding the STAC search.
        target_profile: raster profile defining the output grid (H, W).

    Returns:
        feat_arr: (H, W, C)
        feat_profile: raster profile aligned to target_profile
        feat_names: list[str]
        aux_layers: dict for extra outputs (true_color, ndvi, evi, savi)

    Falls back to `_build_placeholder_features` when STAC loading fails.
    """
    # STAC deps are optional at module import time; fail loudly here instead.
    try:
        import pystac_client
        import stackstac
    except ImportError:
        raise ImportError("pystac-client and stackstac are required for DEA STAC loading")

    H = target_profile["height"]
    W = target_profile["width"]

    # DEA STAC configuration
    stac_url = cfg.dea_stac_url if hasattr(cfg, 'dea_stac_url') else "https://explorer.digitalearth.africa/stac"

    # AOI to bbox (rough degrees conversion, same as clip_raster_to_aoi)
    lon, lat, radius_m = aoi
    deg = radius_m / 111_320.0
    bbox = [lon - deg, lat - deg, lon + deg, lat + deg]

    print(f"🔍 Querying DEA STAC: {stac_url}")
    print(f"   _bbox: {bbox}")
    print(f"   _dates: {start_date} to {end_date}")

    try:
        client = pystac_client.Client.open(stac_url)
        # Search for Sentinel-2 L2A
        search = client.search(
            collections=["s2_l2a"],
            bbox=bbox,
            datetime=f"{start_date}/{end_date}",
            query={
                "eo:cloud_cover": {"lt": 30},  # Cloud filter
            },
        )
        items = list(search.items())
        print(f"   Found {len(items)} Sentinel-2 scenes")
        if len(items) == 0:
            raise ValueError("No Sentinel-2 imagery available for the selected AOI and date range")

        # Load data using stackstac.
        # Required bands: red, green, blue, nir, rededge (B5), swir1, swir2
        bands = ["red", "green", "blue", "nir", "nir08", "nir09", "swir16", "swir22"]
        cube = stackstac.stack(
            items,
            bounds=bbox,
            resolution=10,              # 10m (Sentinel-2 native)
            bands=bands,
            chunks={"x": 512, "y": 512},
            epsg=32736,                 # UTM Zone 36S (Zimbabwe)
        )
        print(f"   Loaded cube shape: {cube.shape}")
    except Exception as e:
        print(f"   ⚠️ DEA STAC loading failed: {e}")
        print("   Returning placeholder features for development")
        return _build_placeholder_features(H, W, target_profile)

    # Extract dates from the cube
    cube_dates = pd.to_datetime(cube.time.values)
    date_strings = [d.strftime('%Y%m%d') for d in cube_dates]

    # stackstac returns (T, C, H, W).
    # NOTE(review): the cube's spatial shape is assumed to match
    # target_profile's (H, W) — confirm upstream grid alignment.
    band_data = cube.values
    n_times = band_data.shape[0]
    band_names = list(cube.band.values)

    def get_band_data(band_name):
        # Falls back to band 0 if the name is missing (callers pre-check membership).
        idx = band_names.index(band_name) if band_name in band_names else 0
        return band_data[:, idx, :, :]  # (T, H, W)

    # Gather the bands actually present in this cube.
    available_bands = {}
    for bn in ['red', 'green', 'blue', 'nir', 'nir08', 'nir09', 'swir16', 'swir22']:
        if bn in band_names:
            available_bands[bn] = get_band_data(bn)

    # Compute indices for each timestep and stack into (H, W, T) series.
    timeseries_dict = {}
    for t in range(n_times):
        bands_t = {k: v[t] for k, v in available_bands.items()}
        red = bands_t.get('red')
        nir = bands_t.get('nir')
        green = bands_t.get('green')
        blue = bands_t.get('blue')
        nir08 = bands_t.get('nir08')    # B8A (used as red-edge)
        swir16 = bands_t.get('swir16')  # B11
        swir22 = bands_t.get('swir22')  # B12
        if red is None or nir is None:
            continue

        indices_t = compute_indices_from_bands(
            red=red, nir=nir, blue=blue, green=green,
            swir1=swir16, swir2=swir22
        )

        # Override NDRE/CI_RE with a true red-edge band when available
        # (compute_indices_from_bands only had the SWIR1 proxy here).
        rededge = nir08 if nir08 is not None else swir16
        if rededge is not None:
            denom = nir + rededge
            indices_t['ndre'] = np.where(denom != 0, (nir - rededge) / denom, 0)
            indices_t['ci_re'] = np.where(rededge != 0, (nir / rededge) - 1, 0)

        for idx_name, idx_arr in indices_t.items():
            if idx_name not in timeseries_dict:
                timeseries_dict[idx_name] = np.zeros((H, W, n_times), dtype=np.float32)
            timeseries_dict[idx_name][:, :, t] = idx_arr.astype(np.float32)

    # Ensure at least one index exists
    if not timeseries_dict:
        print("   ⚠️ No indices computed, returning placeholders")
        return _build_placeholder_features(H, W, target_profile)

    # ========================================
    # Apply Feature Engineering Pipeline
    # (matching train.py exactly)
    # ========================================
    print("   🔧 Applying feature engineering pipeline...")

    # 1. Apply smoothing (Savitzky-Golay)
    print("      - Smoothing (Savitzky-Golay window=5, polyorder=2)")
    smoothed_dict = apply_smoothing_to_rasters(timeseries_dict, date_strings)

    # 2. Extract phenology
    print("      - Phenology metrics (amplitude, AUC, peak, slope)")
    phenology_features = extract_phenology_from_rasters(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndre', 'evi', 'savi']
    )

    # 3. Add harmonics
    print("      - Harmonic features (1st/2nd order sin/cos)")
    harmonic_features = add_harmonics_to_rasters(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndre', 'evi']
    )

    # 4. Seasonal windows + interactions
    print("      - Seasonal windows (Early/Peak/Late) + interactions")
    window_features = add_seasonal_windows_and_interactions(
        smoothed_dict, date_strings,
        indices=['ndvi', 'ndwi', 'ndre'],
        phenology_features=phenology_features
    )

    # ========================================
    # Combine all features
    # ========================================
    all_features = {}
    all_features.update(phenology_features)
    all_features.update(harmonic_features)
    all_features.update(window_features)

    # Deterministic feature order:
    # phenology (ndvi, ndre, evi, savi) -> harmonics -> windows -> interactions
    feat_names = []
    for idx in ['ndvi', 'ndre', 'evi', 'savi']:
        for suffix in ['_max', '_min', '_mean', '_std', '_amplitude', '_auc',
                       '_peak_timestep', '_max_slope_up', '_max_slope_down']:
            key = f'{idx}{suffix}'
            if key in all_features:
                feat_names.append(key)
    for idx in ['ndvi', 'ndre', 'evi']:
        for suffix in ['_harmonic1_sin', '_harmonic1_cos',
                       '_harmonic2_sin', '_harmonic2_cos']:
            key = f'{idx}{suffix}'
            if key in all_features:
                feat_names.append(key)
    for idx in ['ndvi', 'ndwi', 'ndre']:
        for win in ['early', 'peak', 'late']:
            for stat in ['_mean', '_max']:
                key = f'{idx}_{win}{stat}'
                if key in all_features:
                    feat_names.append(key)
    for key in ['ndvi_ndre_peak_diff', 'canopy_density_contrast']:
        if key in all_features:
            feat_names.append(key)

    print(f"   Total features: {len(feat_names)}")

    # Build feature array (feat_names only contains keys of all_features).
    feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32)
    for i, feat_name in enumerate(feat_names):
        feat_arr[:, :, i] = all_features[feat_name]

    # Scrub NaN/Inf so the model never sees them.
    feat_arr = np.nan_to_num(feat_arr, nan=0.0, posinf=0.0, neginf=0.0)

    # ========================================
    # Build aux layers for visualization
    # ========================================
    aux_layers = {}
    if 'red' in available_bands and 'green' in available_bands and 'blue' in available_bands:
        # Simple median composite across time for a true-color preview.
        tc = np.stack([
            np.median(available_bands['red'], axis=0),
            np.median(available_bands['green'], axis=0),
            np.median(available_bands['blue'], axis=0),
        ], axis=-1)
        aux_layers['true_color'] = tc.astype(np.uint16)

    # Index peaks for visualization
    for idx in ['ndvi', 'evi', 'savi']:
        if f'{idx}_max' in all_features:
            aux_layers[f'{idx}_peak'] = all_features[f'{idx}_max']

    feat_profile = target_profile.copy()
    feat_profile.update({"count": 1, "dtype": "float32"})
    return feat_arr, feat_profile, feat_names, aux_layers
This allows the pipeline to run during development without API access. """ # Minimal feature set matching training expected features feat_names = ["ndvi_peak", "evi_peak", "savi_peak"] feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32) aux_layers = { "true_color": np.zeros((H, W, 3), dtype=np.uint16), "ndvi_peak": np.zeros((H, W), dtype=np.float32), "evi_peak": np.zeros((H, W), dtype=np.float32), "savi_peak": np.zeros((H, W), dtype=np.float32), } feat_profile = target_profile.copy() feat_profile.update({"count": 1, "dtype": "float32"}) return feat_arr, feat_profile, feat_names, aux_layers # ------------------------- # Neighborhood smoothing # ------------------------- def majority_filter(arr: np.ndarray, k: int = 3) -> np.ndarray: """Majority filter for 2D class label arrays. arr may be dtype string (labels) or integers. For strings, we use a slower path with unique counts. k must be odd (3,5,7). NOTE: This is a simple CPU implementation. For speed: - convert labels to ints - use scipy.ndimage or numba - or apply with rasterio/gdal focal statistics """ if k % 2 == 0 or k < 3: raise ValueError("k must be odd and >= 3") pad = k // 2 H, W = arr.shape padded = np.pad(arr, ((pad, pad), (pad, pad)), mode="edge") out = arr.copy() # If numeric, use bincount fast path if np.issubdtype(arr.dtype, np.integer): maxv = int(arr.max()) if arr.size else 0 for y in range(H): for x in range(W): win = padded[y : y + k, x : x + k].ravel() counts = np.bincount(win, minlength=maxv + 1) out[y, x] = counts.argmax() return out # String/obj path for y in range(H): for x in range(W): win = padded[y : y + k, x : x + k].ravel() vals, counts = np.unique(win, return_counts=True) out[y, x] = vals[counts.argmax()] return out