"""Feature engineering + geospatial helpers for GeoCrop. This module is shared by training (feature selection + scaling helpers) AND inference (DEA STAC fetch + raster alignment + smoothing). IMPORTANT: This implementation exactly replicates train.py feature engineering: - Savitzky-Golay smoothing (window=5, polyorder=2) with 0-interpolation - Phenology metrics (amplitude, AUC, peak_timestep, max_slope) - Harmonic/Fourier features (1st and 2nd order sin/cos) - Seasonal window statistics (Early: Oct-Dec, Peak: Jan-Mar, Late: Apr-Jun) """ from __future__ import annotations import json import re from dataclasses import dataclass from datetime import date from typing import Dict, Iterable, List, Optional, Tuple import numpy as np import pandas as pd # Raster / geo import rasterio from rasterio.enums import Resampling # ========================================== # Training helpers # ========================================== def drop_junk_columns(df: pd.DataFrame, junk_cols: List[str]) -> pd.DataFrame: """Drop junk/spatial columns that would cause data leakage. Matches train.py junk_cols: ['.geo', 'system:index', 'latitude', 'longitude', 'lat', 'lon', 'ID', 'parent_id', 'batch_id', 'is_syn'] """ cols_to_drop = [c for c in junk_cols if c in df.columns] return df.drop(columns=cols_to_drop) def scout_feature_selection( X_train: pd.DataFrame, y_train: np.ndarray, n_estimators: int = 100, random_state: int = 42, ) -> List[str]: """Scout LightGBM feature selection (keeps non-zero importances).""" import lightgbm as lgb lgbm = lgb.LGBMClassifier(n_estimators=n_estimators, random_state=random_state, verbose=-1) lgbm.fit(X_train, y_train) importances = pd.DataFrame( {"Feature": X_train.columns, "Importance": lgbm.feature_importances_} ).sort_values("Importance", ascending=False) selected = importances[importances["Importance"] > 0]["Feature"].tolist() if not selected: # Fallback: keep everything (better than breaking training) selected = list(X_train.columns) return selected def scale_numeric_features( X_train: pd.DataFrame, X_test: pd.DataFrame, ): """Scale only numeric columns, return (X_train_scaled, X_test_scaled, scaler). Uses StandardScaler (matches train.py). """ from sklearn.preprocessing import StandardScaler scaler = StandardScaler() num_cols = X_train.select_dtypes(include=[np.number]).columns X_train_scaled = X_train.copy() X_test_scaled = X_test.copy() X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols]) X_test_scaled[num_cols] = scaler.transform(X_test[num_cols]) return X_train_scaled, X_test_scaled, scaler # ========================================== # INFERENCE-ONLY FEATURE ENGINEERING # These functions replicate train.py for raster-based inference # ========================================== def apply_smoothing_to_rasters( timeseries_dict: Dict[str, np.ndarray], dates: List[str] ) -> Dict[str, np.ndarray]: """Apply Savitzky-Golay smoothing to time-series raster arrays. Replicates train.py apply_smoothing(): 1. Replace 0 with NaN 2. Linear interpolate across time axis, fillna(0) 3. Savitzky-Golay: window_length=5, polyorder=2 Args: timeseries_dict: Dict mapping index name to (H, W, T) array dates: List of date strings in YYYYMMDD format Returns: Dict mapping index name to smoothed (H, W, T) array """ from scipy.signal import savgol_filter smoothed = {} n_times = len(dates) for idx_name, arr in timeseries_dict.items(): # arr shape: (H, W, T) H, W, T = arr.shape # Reshape to (H*W, T) for vectorized processing arr_2d = arr.reshape(-1, T) # 1. Replace 0 with NaN arr_2d = np.where(arr_2d == 0, np.nan, arr_2d) # 2. Linear interpolate across time axis (axis=1) # Handle each row (each pixel) independently interp_rows = [] for row in arr_2d: # Use pandas Series for linear interpolation ser = pd.Series(row) ser = ser.interpolate(method='linear', limit_direction='both') interp_rows.append(ser.fillna(0).values) interp_arr = np.array(interp_rows) # 3. Apply Savitzky-Golay smoothing # window_length=5, polyorder=2 smooth_arr = savgol_filter(interp_arr, window_length=5, polyorder=2, axis=1) # Reshape back to (H, W, T) smoothed[idx_name] = smooth_arr.reshape(H, W, T) return smoothed def extract_phenology_from_rasters( timeseries_dict: Dict[str, np.ndarray], dates: List[str], indices: List[str] = ['ndvi', 'ndre', 'evi'] ) -> Dict[str, np.ndarray]: """Extract phenology metrics from time-series raster arrays. Replicates train.py extract_phenology(): - Magnitude: max, min, mean, std, amplitude - AUC: trapezoid integral with dx=10 - Timing: peak_timestep (argmax) - Slopes: max_slope_up, max_slope_down Args: timeseries_dict: Dict mapping index name to (H, W, T) array (should be smoothed) dates: List of date strings indices: Which indices to process Returns: Dict mapping feature name to (H, W) array """ from scipy.integrate import trapezoid features = {} for idx in indices: if idx not in timeseries_dict: continue arr = timeseries_dict[idx] # (H, W, T) H, W, T = arr.shape # Reshape to (H*W, T) for vectorized processing arr_2d = arr.reshape(-1, T) # Magnitude Metrics features[f'{idx}_max'] = np.max(arr_2d, axis=1).reshape(H, W) features[f'{idx}_min'] = np.min(arr_2d, axis=1).reshape(H, W) features[f'{idx}_mean'] = np.mean(arr_2d, axis=1).reshape(H, W) features[f'{idx}_std'] = np.std(arr_2d, axis=1).reshape(H, W) features[f'{idx}_amplitude'] = features[f'{idx}_max'] - features[f'{idx}_min'] # AUC (Area Under Curve) with dx=10 (10-day intervals) features[f'{idx}_auc'] = trapezoid(arr_2d, dx=10, axis=1).reshape(H, W) # Peak timestep (timing) peak_indices = np.argmax(arr_2d, axis=1) features[f'{idx}_peak_timestep'] = peak_indices.reshape(H, W) # Slopes (rates of change) slopes = np.diff(arr_2d, axis=1) # (H*W, T-1) features[f'{idx}_max_slope_up'] = np.max(slopes, axis=1).reshape(H, W) features[f'{idx}_max_slope_down'] = np.min(slopes, axis=1).reshape(H, W) return features def add_harmonics_to_rasters( timeseries_dict: Dict[str, np.ndarray], dates: List[str], indices: List[str] = ['ndvi'] ) -> Dict[str, np.ndarray]: """Add harmonic/fourier features from time-series raster arrays. Replicates train.py add_harmonics(): - 1st order: sin(t), cos(t) - 2nd order: sin(2t), cos(2t) where t = 2*pi * time_step / n_times Args: timeseries_dict: Dict mapping index name to (H, W, T) array (should be smoothed) dates: List of date strings indices: Which indices to process Returns: Dict mapping feature name to (H, W) array """ features = {} n_times = len(dates) # Normalize time to 0-2pi (one full cycle) time_steps = np.arange(n_times) t = 2 * np.pi * time_steps / n_times sin_t = np.sin(t) cos_t = np.cos(t) sin_2t = np.sin(2 * t) cos_2t = np.cos(2 * t) for idx in indices: if idx not in timeseries_dict: continue arr = timeseries_dict[idx] # (H, W, T) H, W, T = arr.shape # Reshape to (H*W, T) for vectorized processing arr_2d = arr.reshape(-1, T) # Normalized dot products (harmonic coefficients) features[f'{idx}_harmonic1_sin'] = np.dot(arr_2d, sin_t) / n_times features[f'{idx}_harmonic1_cos'] = np.dot(arr_2d, cos_t) / n_times features[f'{idx}_harmonic2_sin'] = np.dot(arr_2d, sin_2t) / n_times features[f'{idx}_harmonic2_cos'] = np.dot(arr_2d, cos_2t) / n_times # Reshape back to (H, W) for feat_name in [f'{idx}_harmonic1_sin', f'{idx}_harmonic1_cos', f'{idx}_harmonic2_sin', f'{idx}_harmonic2_cos']: features[feat_name] = features[feat_name].reshape(H, W) return features def add_seasonal_windows_and_interactions( timeseries_dict: Dict[str, np.ndarray], dates: List[str], indices: List[str] = ['ndvi', 'ndwi', 'ndre'], phenology_features: Dict[str, np.ndarray] = None ) -> Dict[str, np.ndarray]: """Add seasonal window statistics and index interactions. Replicates train.py add_interactions_and_windows(): - Seasonal windows (Zimbabwe season: Oct-Jun): - Early: Oct-Dec (months 10, 11, 12) - Peak: Jan-Mar (months 1, 2, 3) - Late: Apr-Jun (months 4, 5, 6) - Interactions: - ndvi_ndre_peak_diff = ndvi_max - ndre_max - canopy_density_contrast = evi_mean / (ndvi_mean + 0.001) Args: timeseries_dict: Dict mapping index name to (H, W, T) array dates: List of date strings in YYYYMMDD format indices: Which indices to process phenology_features: Dict of phenology features for interactions Returns: Dict mapping feature name to (H, W) array """ features = {} # Parse dates to identify months dt_dates = pd.to_datetime(dates, format='%Y%m%d') # Define seasonal windows (months) windows = { 'early': [10, 11, 12], # Oct-Dec 'peak': [1, 2, 3], # Jan-Mar 'late': [4, 5, 6] # Apr-Jun } for idx in indices: if idx not in timeseries_dict: continue arr = timeseries_dict[idx] # (H, W, T) H, W, T = arr.shape for win_name, months in windows.items(): # Find time indices belonging to this window month_mask = np.array([d.month in months for d in dt_dates]) if not np.any(month_mask): continue # Extract window slice window_arr = arr[:, :, month_mask] # (H, W, T_window) # Compute statistics window_2d = window_arr.reshape(-1, window_arr.shape[2]) features[f'{idx}_{win_name}_mean'] = np.mean(window_2d, axis=1).reshape(H, W) features[f'{idx}_{win_name}_max'] = np.max(window_2d, axis=1).reshape(H, W) # Add interactions (if phenology features available) if phenology_features is not None: # ndvi_ndre_peak_diff if 'ndvi_max' in phenology_features and 'ndre_max' in phenology_features: features['ndvi_ndre_peak_diff'] = ( phenology_features['ndvi_max'] - phenology_features['ndre_max'] ) # canopy_density_contrast if 'evi_mean' in phenology_features and 'ndvi_mean' in phenology_features: features['canopy_density_contrast'] = ( phenology_features['evi_mean'] / (phenology_features['ndvi_mean'] + 0.001) ) return features # ========================================== # Inference helpers # ========================================== # AOI tuple: (lon, lat, radius_m) AOI = Tuple[float, float, float] def validate_aoi_zimbabwe(aoi: AOI, max_radius_m: float = 5000.0): """Basic AOI validation. - Ensures radius <= max_radius_m - Ensures AOI center is within rough Zimbabwe bounds. NOTE: For production, use a real Zimbabwe polygon and check circle intersects. You can load a simplified boundary GeoJSON and use shapely. """ lon, lat, radius_m = aoi if radius_m <= 0 or radius_m > max_radius_m: raise ValueError(f"radius_m must be in (0, {max_radius_m}]") # Rough bbox for Zimbabwe (good cheap pre-check). # Lon: 25.2 to 33.1, Lat: -22.5 to -15.6 if not (25.2 <= lon <= 33.1 and -22.5 <= lat <= -15.6): raise ValueError("AOI must be within Zimbabwe") def clip_raster_to_aoi( src_path: str, aoi: AOI, dst_profile_like: Optional[dict] = None, ) -> Tuple[np.ndarray, dict]: """Clip a raster to AOI circle. Template implementation: reads a window around the circle's bbox. For exact circle mask, add a mask step after reading. """ lon, lat, radius_m = aoi with rasterio.open(src_path) as src: # Approx bbox from radius using rough degrees conversion. # Production: use pyproj geodesic buffer. deg = radius_m / 111_320.0 minx, maxx = lon - deg, lon + deg miny, maxy = lat - deg, lat + deg window = rasterio.windows.from_bounds(minx, miny, maxx, maxy, transform=src.transform) window = window.round_offsets().round_lengths() arr = src.read(1, window=window) profile = src.profile.copy() # Update transform for the window profile.update( { "height": arr.shape[0], "width": arr.shape[1], "transform": rasterio.windows.transform(window, src.transform), } ) # Optional: resample/align to dst_profile_like if dst_profile_like is not None: arr, profile = _resample_to_profile(arr, profile, dst_profile_like) return arr, profile def _resample_to_profile(arr: np.ndarray, src_profile: dict, dst_profile: dict) -> Tuple[np.ndarray, dict]: """Nearest-neighbor resample to match dst grid.""" dst_h = dst_profile["height"] dst_w = dst_profile["width"] dst_arr = np.empty((dst_h, dst_w), dtype=arr.dtype) with rasterio.io.MemoryFile() as mem: with mem.open(**src_profile) as src: src.write(arr, 1) rasterio.warp.reproject( source=rasterio.band(src, 1), destination=dst_arr, src_transform=src_profile["transform"], src_crs=src_profile["crs"], dst_transform=dst_profile["transform"], dst_crs=dst_profile["crs"], resampling=Resampling.nearest, ) prof = dst_profile.copy() prof.update({"count": 1, "dtype": str(dst_arr.dtype)}) return dst_arr, prof def load_dw_baseline_window(cfg, year: int, season: str, aoi: AOI) -> Tuple[np.ndarray, dict]: """Loads the DW baseline seasonal COG from MinIO and clips to AOI. The cfg.storage implementation decides whether to stream or download locally. Expected naming convention: dw_{season}_{year}.tif OR DW_Zim_HighestConf_{year}_{year+1}.tif You can implement a mapping in cfg.dw_key_for(year, season). """ local_path = cfg.storage.get_dw_local_path(year=year, season=season) # Configure Rasterio Env for MinIO /vsis3 access if needed import rasterio.env storage = cfg.storage endpoint = storage.endpoint # Ensure no http/https prefix in endpoint for GDAL if "://" in endpoint: endpoint = endpoint.split("://")[1] env_config = { "GDAL_DISABLE_READDIR_ON_OPEN": "EMPTY_DIR", } print(f" Configuring Rasterio Env for {local_path}...") with rasterio.env.Env(**env_config): arr, profile = clip_raster_to_aoi(local_path, aoi) # Ensure a single band profile profile.update({"count": 1}) if "dtype" not in profile: profile["dtype"] = str(arr.dtype) return arr, profile # ------------------------- # DEA STAC feature stack # ------------------------- def compute_indices_from_bands( red: np.ndarray, nir: np.ndarray, blue: np.ndarray = None, green: np.ndarray = None, swir1: np.ndarray = None, swir2: np.ndarray = None, rededge: np.ndarray = None ) -> Dict[str, np.ndarray]: """Compute vegetation indices from band arrays. Indices computed: - NDVI = (NIR - Red) / (NIR + Red) - EVI = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1) - SAVI = ((NIR - Red) / (NIR + Red + L)) * (1 + L) where L=0.5 - NDRE = (NIR - RedEdge) / (NIR + RedEdge) - CI_RE = (NIR / RedEdge) - 1 - NDWI = (Green - NIR) / (Green + NIR) Args: red: Red band (B4) nir: NIR band (B8) blue: Blue band (B2, optional) green: Green band (B3, optional) swir1: SWIR1 band (B11, optional) swir2: SWIR2 band (B12, optional) rededge: RedEdge band (B5, optional) Returns: Dict mapping index name to array """ indices = {} # Ensure float64 for precision nir = nir.astype(np.float64) red = red.astype(np.float64) # NDVI = (NIR - Red) / (NIR + Red) denominator = nir + red indices['ndvi'] = np.where(denominator != 0, (nir - red) / denominator, 0) # EVI = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1) if blue is not None: blue = blue.astype(np.float64) evi_denom = nir + 6*red - 7.5*blue + 1 indices['evi'] = np.where(evi_denom != 0, 2.5 * (nir - red) / evi_denom, 0) # SAVI = ((NIR - Red) / (NIR + Red + L)) * (1 + L) where L=0.5 L = 0.5 savi_denom = nir + red + L indices['savi'] = np.where(savi_denom != 0, ((nir - red) / savi_denom) * (1 + L), 0) # NDRE = (NIR - RedEdge) / (NIR + RedEdge) # RedEdge is typically B5 (705nm) - use NIR if not available if rededge is not None: rededge = rededge.astype(np.float64) ndre_denom = nir + rededge indices['ndre'] = np.where(ndre_denom != 0, (nir - rededge) / ndre_denom, 0) # CI_RE = (NIR / RedEdge) - 1 indices['ci_re'] = np.where(rededge != 0, (nir / rededge) - 1, 0) else: # Fallback: use SWIR1 as proxy for red-edge if available if swir1 is not None: swir1 = swir1.astype(np.float64) ndre_denom = nir + swir1 indices['ndre'] = np.where(ndre_denom != 0, (nir - swir1) / ndre_denom, 0) indices['ci_re'] = np.where(swir1 != 0, (nir / swir1) - 1, 0) # NDWI = (Green - NIR) / (Green + NIR) if green is not None: green = green.astype(np.float64) ndwi_denom = green + nir indices['ndwi'] = np.where(ndwi_denom != 0, (green - nir) / ndwi_denom, 0) return indices def build_feature_stack_from_dea( cfg, aoi: AOI, start_date: str, end_date: str, target_profile: dict, ) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]: """Query DEA STAC and compute a per-pixel feature cube. This function implements the FULL feature engineering pipeline matching train.py: 1. Load Sentinel-2 data from DEA STAC 2. Compute indices (ndvi, ndre, evi, savi, ci_re, ndwi) 3. Apply Savitzky-Golay smoothing with 0-interpolation 4. Extract phenology metrics (amplitude, AUC, peak, slope) 5. Add harmonic/fourier features 6. Add seasonal window statistics 7. Add index interactions Returns: feat_arr: (H, W, C) feat_profile: raster profile aligned to target_profile feat_names: list[str] aux_layers: dict for extra outputs (true_color, ndvi, evi, savi) """ # Import STAC dependencies try: import pystac_client import stackstac except ImportError: raise ImportError("pystac-client and stackstac are required for DEA STAC loading") from scipy.signal import savgol_filter from scipy.integrate import trapezoid H = target_profile["height"] W = target_profile["width"] # DEA STAC configuration stac_url = cfg.dea_stac_url if hasattr(cfg, 'dea_stac_url') else "https://explorer.digitalearth.africa/stac" # AOI to bbox lon, lat, radius_m = aoi deg = radius_m / 111_320.0 bbox = [lon - deg, lat - deg, lon + deg, lat + deg] # Query DEA STAC print(f"🔍 Querying DEA STAC: {stac_url}") print(f" _bbox: {bbox}") print(f" _dates: {start_date} to {end_date}") try: client = pystac_client.Client.open(stac_url) # Search for Sentinel-2 L2A search = client.search( collections=["s2_l2a"], bbox=bbox, datetime=f"{start_date}/{end_date}" ) items = list(search.items()) # Filter by cloud cover manually since query extension is deprecated/unsupported items = [it for it in items if it.properties.get("eo:cloud_cover", 100) < 30] print(f" Found {len(items)} Sentinel-2 scenes (after cloud filtering)") if len(items) == 0: raise ValueError("No Sentinel-2 imagery available for the selected AOI and date range") # Load data using stackstac # Required bands: red, green, blue, nir, rededge (B5), swir1, swir2 bands = ["red", "green", "blue", "nir", "nir08", "nir09", "swir16", "swir22"] cube = stackstac.stack( items, bounds=bbox, resolution=10, # 10m (Sentinel-2 native) bands=bands, chunks={"x": 512, "y": 512}, epsg=32736, # UTM Zone 36S (Zimbabwe) ) print(f" Loaded cube shape: {cube.shape}") except Exception as e: print(f" ⚠️ DEA STAC loading failed: {e}") print(f" Returning placeholder features for development") return _build_placeholder_features(H, W, target_profile) # Extract dates from the cube cube_dates = pd.to_datetime(cube.time.values) date_strings = [d.strftime('%Y%m%d') for d in cube_dates] # Get band data - stackstac returns (T, C, H, W), transpose to (C, T, H, W) band_data = cube.values # (T, C, H, W) n_times = band_data.shape[0] # Map bands to names band_names = list(cube.band.values) # Extract individual bands def get_band_data(band_name): idx = band_names.index(band_name) if band_name in band_names else 0 # Shape: (T, H, W) return band_data[:, idx, :, :] # Build timeseries dict for each index # Compute indices for each timestep indices_list = [] # Get available bands available_bands = {} for bn in ['red', 'green', 'blue', 'nir', 'nir08', 'nir09', 'swir16', 'swir22']: if bn in band_names: available_bands[bn] = get_band_data(bn) # Compute indices for each timestep timeseries_dict = {} for t in range(n_times): # Get bands for this timestep bands_t = {k: v[t] for k, v in available_bands.items()} # Compute indices red = bands_t.get('red', None) nir = bands_t.get('nir', None) green = bands_t.get('green', None) blue = bands_t.get('blue', None) nir08 = bands_t.get('nir08', None) # B8A (red-edge) swir16 = bands_t.get('swir16', None) # B11 swir22 = bands_t.get('swir22', None) # B12 if red is None or nir is None: continue # Compute indices at this timestep # Use nir08 as red-edge if available, else swir16 as proxy rededge = nir08 if nir08 is not None else (swir16 if swir16 is not None else None) indices_t = compute_indices_from_bands( red=red, nir=nir, blue=blue, green=green, swir1=swir16, swir2=swir22 ) # Add NDRE and CI_RE if we have red-edge if rededge is not None: denom = nir + rededge indices_t['ndre'] = np.where(denom != 0, (nir - rededge) / denom, 0) indices_t['ci_re'] = np.where(rededge != 0, (nir / rededge) - 1, 0) # Stack into timeseries for idx_name, idx_arr in indices_t.items(): if idx_name not in timeseries_dict: timeseries_dict[idx_name] = np.zeros((H, W, n_times), dtype=np.float32) timeseries_dict[idx_name][:, :, t] = idx_arr.astype(np.float32) # Ensure at least one index exists if not timeseries_dict: print(" ⚠️ No indices computed, returning placeholders") return _build_placeholder_features(H, W, target_profile) # ======================================== # Apply Feature Engineering Pipeline # (matching train.py exactly) # ======================================== print(" 🔧 Applying feature engineering pipeline...") # Check if we need V2 features from feature_computation import FEATURE_ORDER_V2, build_features_v2_for_pixel # If the caller expects V2 features (detected by length/order), use the V2 path # For raster processing, we implement a vectorized version of build_features_v2_for_pixel is_v2 = False if hasattr(cfg, 'current_feature_order') and cfg.current_feature_order == FEATURE_ORDER_V2: is_v2 = True elif hasattr(cfg, 'n_expected_features') and cfg.n_expected_features == len(FEATURE_ORDER_V2): is_v2 = True if is_v2: print(" - Using V2 Feature Pipeline (Vectorized)") feat_arr, feat_names = _compute_v2_features_vectorized(timeseries_dict, date_strings, H, W) else: # 1. Apply smoothing (Savitzky-Golay) print(" - Smoothing (Savitzky-Golay window=5, polyorder=2)") smoothed_dict = apply_smoothing_to_rasters(timeseries_dict, date_strings) # 2. Extract phenology print(" - Phenology metrics (amplitude, AUC, peak, slope)") phenology_features = extract_phenology_from_rasters( smoothed_dict, date_strings, indices=['ndvi', 'ndre', 'evi', 'savi'] ) # 3. Add harmonics print(" - Harmonic features (1st/2nd order sin/cos)") harmonic_features = add_harmonics_to_rasters( smoothed_dict, date_strings, indices=['ndvi', 'ndre', 'evi'] ) # 4. Seasonal windows + interactions print(" - Seasonal windows (Early/Peak/Late) + interactions") window_features = add_seasonal_windows_and_interactions( smoothed_dict, date_strings, indices=['ndvi', 'ndwi', 'ndre'], phenology_features=phenology_features ) # ======================================== # Combine all features # ======================================== # Collect all features in order all_features = {} all_features.update(phenology_features) all_features.update(harmonic_features) all_features.update(window_features) # Get feature names in consistent order feat_names = [] # Phenology order: ndvi, ndre, evi, savi for idx in ['ndvi', 'ndre', 'evi', 'savi']: for suffix in ['_max', '_min', '_mean', '_std', '_amplitude', '_auc', '_peak_timestep', '_max_slope_up', '_max_slope_down']: key = f'{idx}{suffix}' if key in all_features: feat_names.append(key) # Harmonics order: ndvi, ndre, evi for idx in ['ndvi', 'ndre', 'evi']: for suffix in ['_harmonic1_sin', '_harmonic1_cos', '_harmonic2_sin', '_harmonic2_cos']: key = f'{idx}{suffix}' if key in all_features: feat_names.append(key) # Window features: ndvi, ndwi, ndre (early, peak, late) for idx in ['ndvi', 'ndwi', 'ndre']: for win in ['early', 'peak', 'late']: for stat in ['_mean', '_max']: key = f'{idx}_{win}{stat}' if key in all_features: feat_names.append(key) # Interactions if 'ndvi_ndre_peak_diff' in all_features: feat_names.append('ndvi_ndre_peak_diff') if 'canopy_density_contrast' in all_features: feat_names.append('canopy_density_contrast') print(f" Total features: {len(feat_names)}") # Build feature array feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32) for i, feat_name in enumerate(feat_names): if feat_name in all_features: feat_arr[:, :, i] = all_features[feat_name] # Handle NaN/Inf feat_arr = np.nan_to_num(feat_arr, nan=0.0, posinf=0.0, neginf=0.0) # ======================================== # Build aux layers for visualization # ======================================== aux_layers = {} # True color (use first clear observation) if 'red' in available_bands and 'green' in available_bands and 'blue' in available_bands: # Get median of clear observations red_arr = available_bands['red'] # (T, H, W) green_arr = available_bands['green'] blue_arr = available_bands['blue'] # Simple median composite tc = np.stack([ np.median(red_arr, axis=0), np.median(green_arr, axis=0), np.median(blue_arr, axis=0), ], axis=-1) aux_layers['true_color'] = tc.astype(np.uint16) # Index peaks for visualization for idx in ['ndvi', 'evi', 'savi']: if f'{idx}_max' in all_features: aux_layers[f'{idx}_peak'] = all_features[f'{idx}_max'] feat_profile = target_profile.copy() feat_profile.update({"count": 1, "dtype": "float32"}) return feat_arr, feat_profile, feat_names, aux_layers def _build_placeholder_features(H: int, W: int, target_profile: dict) -> Tuple[np.ndarray, dict, List[str], Dict[str, np.ndarray]]: """Build placeholder features when DEA STAC is unavailable. This allows the pipeline to run during development without API access. """ # Minimal feature set matching training expected features feat_names = ["ndvi_peak", "evi_peak", "savi_peak"] feat_arr = np.zeros((H, W, len(feat_names)), dtype=np.float32) aux_layers = { "true_color": np.zeros((H, W, 3), dtype=np.uint16), "ndvi_peak": np.zeros((H, W), dtype=np.float32), "evi_peak": np.zeros((H, W), dtype=np.float32), "savi_peak": np.zeros((H, W), dtype=np.float32), } feat_profile = target_profile.copy() feat_profile.update({"count": 1, "dtype": "float32"}) return feat_arr, feat_profile, feat_names, aux_layers # ------------------------- # Neighborhood smoothing # ------------------------- def _compute_v2_features_vectorized(timeseries_dict, dates, H, W) -> Tuple[np.ndarray, List[str]]: """Vectorized version of build_features_v2_for_pixel for raster arrays.""" from scipy.stats import skew, kurtosis from scipy.integrate import simpson from feature_computation import FEATURE_ORDER_V2, V2_INDICES, smooth_series, harmonic_features dt_dates = pd.to_datetime(dates, format='%Y%m%d') t_days = (dt_dates - dt_dates[0]).days.values n_times = len(dates) # Pre-compute smoothed series for all required indices smoothed = {} for idx in set(V2_INDICES + ["ndwi"]): if idx in timeseries_dict: arr = timeseries_dict[idx] # (H, W, T) T = arr.shape[2] arr_2d = arr.reshape(-1, T) # Apply smoothing to each pixel (vectorized smoothing is harder, so we loop pixels) # Optimization: only smooth non-zero pixels smoothed_arr_2d = np.zeros_like(arr_2d) for i in range(arr_2d.shape[0]): if np.any(arr_2d[i] > 0): smoothed_arr_2d[i] = smooth_series(arr_2d[i]) smoothed[idx] = smoothed_arr_2d # (H*W, T) all_features = {} # Stats and Phenology for V2_INDICES for idx in V2_INDICES: if idx not in smoothed: continue arr_2d = smoothed[idx] # (H*W, T) # Central Tendency & Dispersion all_features[f"{idx}_mean"] = np.mean(arr_2d, axis=1) all_features[f"{idx}_median"] = np.median(arr_2d, axis=1) all_features[f"{idx}_min"] = np.min(arr_2d, axis=1) all_features[f"{idx}_max"] = np.max(arr_2d, axis=1) all_features[f"{idx}_std"] = np.std(arr_2d, axis=1) # Range & Percentiles all_features[f"{idx}_p10"] = np.percentile(arr_2d, 10, axis=1) all_features[f"{idx}_p25"] = np.percentile(arr_2d, 25, axis=1) all_features[f"{idx}_p75"] = np.percentile(arr_2d, 75, axis=1) all_features[f"{idx}_p90"] = np.percentile(arr_2d, 90, axis=1) # Shape Metrics (scipy skew/kurtosis are vectorized) all_features[f"{idx}_skew"] = skew(arr_2d, axis=1) all_features[f"{idx}_kurtosis"] = kurtosis(arr_2d, axis=1) # CV all_features[f"{idx}_cv"] = all_features[f"{idx}_std"] / (all_features[f"{idx}_mean"] + 0.001) # Integrals (Large) all_features[f"{idx}_large_integral"] = simpson(y=arr_2d, x=t_days, axis=1) # Phenology all_features[f"{idx}_pos_step"] = np.argmax(arr_2d, axis=1).astype(np.float32) amp = all_features[f"{idx}_max"] - all_features[f"{idx}_min"] thresh = all_features[f"{idx}_min"] + 0.2 * amp sos_list, eos_list = [], [] for i in range(arr_2d.shape[0]): cross = np.where(arr_2d[i] > thresh[i])[0] if len(cross) > 0: sos_list.append(cross[0]) eos_list.append(cross[-1]) else: sos_list.append(0) eos_list.append(n_times - 1) all_features[f"{idx}_sos_step"] = np.array(sos_list, dtype=np.float32) all_features[f"{idx}_eos_step"] = np.array(eos_list, dtype=np.float32) all_features[f"{idx}_los"] = all_features[f"{idx}_eos_step"] - all_features[f"{idx}_sos_step"] # Rates diffs = np.diff(arr_2d, axis=1) dt = np.diff(t_days) rates = diffs / dt all_features[f"{idx}_max_greenup_slope"] = np.max(rates, axis=1) all_features[f"{idx}_max_senescence_slope"] = np.min(rates, axis=1) # Small Integral (loop pixels as integration limits vary) small_ints = [] for i in range(arr_2d.shape[0]): s, e = int(sos_list[i]), int(eos_list[i]) if e > s: val = simpson(y=arr_2d[i, s:e+1], x=t_days[s:e+1]) small_ints.append(val) else: small_ints.append(0.0) all_features[f"{idx}_small_integral"] = np.array(small_ints, dtype=np.float32) # Difference Features jan_idx = [i for i, d in enumerate(dt_dates) if d.month == 1] nov_idx = [i for i, d in enumerate(dt_dates) if d.month == 11] if jan_idx and nov_idx and "ndvi" in smoothed: all_features["ndvi_jan_nov_diff"] = smoothed["ndvi"][:, jan_idx[0]] - smoothed["ndvi"][:, nov_idx[0]] else: all_features["ndvi_jan_nov_diff"] = np.zeros(H*W) # Interactions if "ndvi_max" in all_features and "ndre_max" in all_features: all_features["ndvi_ndre_peak_diff"] = all_features["ndvi_max"] - all_features["ndre_max"] if "evi_mean" in all_features and "ndvi_mean" in all_features: all_features["canopy_density_contrast"] = all_features["evi_mean"] / (all_features["ndvi_mean"] + 0.001) # Harmonics for idx in ["ndvi", "ndre", "evi"]: if idx in smoothed: # We can use the existing add_harmonics_to_rasters logic or similar # For simplicity, we just use the existing one but reshape it harms = add_harmonics_to_rasters({idx: timeseries_dict[idx]}, dates, indices=[idx]) for h_name, h_arr in harms.items(): all_features[h_name] = h_arr.reshape(-1) # Window features window_results = add_seasonal_windows_and_interactions(timeseries_dict, dates, indices=['ndvi', 'ndwi', 'ndre']) for w_name, w_arr in window_results.items(): all_features[w_name] = w_arr.reshape(-1) # Build final array based on FEATURE_ORDER_V2 feat_arr = np.zeros((H * W, len(FEATURE_ORDER_V2)), dtype=np.float32) for i, name in enumerate(FEATURE_ORDER_V2): if name in all_features: feat_arr[:, i] = all_features[name] return feat_arr.reshape(H, W, -1), FEATURE_ORDER_V2 def majority_filter(arr: np.ndarray, k: int = 3) -> np.ndarray: """Majority filter for 2D class label arrays. arr may be dtype string (labels) or integers. For strings, we use a slower path with unique counts. k must be odd (3,5,7). NOTE: This is a simple CPU implementation. For speed: - convert labels to ints - use scipy.ndimage or numba - or apply with rasterio/gdal focal statistics """ if k % 2 == 0 or k < 3: raise ValueError("k must be odd and >= 3") pad = k // 2 H, W = arr.shape padded = np.pad(arr, ((pad, pad), (pad, pad)), mode="edge") out = arr.copy() # If numeric, use bincount fast path if np.issubdtype(arr.dtype, np.integer): maxv = int(arr.max()) if arr.size else 0 for y in range(H): for x in range(W): win = padded[y : y + k, x : x + k].ravel() counts = np.bincount(win, minlength=maxv + 1) out[y, x] = counts.argmax() return out # String/obj path for y in range(H): for x in range(W): win = padded[y : y + k, x : x + k].ravel() vals, counts = np.unique(win, return_counts=True) out[y, x] = vals[counts.argmax()] return out