# NOTE: reference script — original code that produced highly accurate models.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import re
import subprocess
import sys
import warnings
import joblib
from scipy.signal import savgol_filter
from scipy.stats import linregress
from scipy.integrate import trapezoid
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                             confusion_matrix, precision_score, recall_score,
                             top_k_accuracy_score)
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

# Suppress warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings("ignore", category=UserWarning)  # Helper for some sklearn versions
pd.set_option('future.no_silent_downcasting', True)


# Check/Install Libraries
def install_libs():
    """Install any missing third-party dependencies at runtime (Colab bootstrap).

    FIX: the original derived the importable module name via
    ``lib.replace("-", "_")``, which maps "imbalanced-learn" to
    "imbalanced_learn" — but the actual module is "imblearn". The import
    check therefore always failed and pip reinstalled it on every run.
    An explicit pip-name -> module-name map avoids that.
    """
    libs = {
        "minisom": "minisom",
        "imbalanced-learn": "imblearn",
        "xgboost": "xgboost",
        "lightgbm": "lightgbm",
        "catboost": "catboost",
    }
    for pip_name, module_name in libs.items():
        try:
            __import__(module_name)
        except ImportError:
            print(f"Installing {pip_name}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])


install_libs()

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from google.colab import drive
# ==========================================
# 1. DATA LOADING
# ==========================================
def load_balanced_data(folder_path):
    """Mount Google Drive and load the balanced RAW dataset.

    The RAW (unscaled) dataset is used because the smoothing and phenology
    steps operate on physical units, not scaled values. Returns the loaded
    DataFrame, or None when the file is missing.
    """
    print(f"Mounting Google Drive and searching in: {folder_path}")
    drive.mount('/content/drive')
    file_path = os.path.join(folder_path, "Zimbabwe_Crop_Balanced_Raw.csv")
    if not os.path.exists(file_path):
        print("Error: 'Zimbabwe_Crop_Balanced_Raw.csv' not found. Please run the balancing step first.")
        return None
    print(f"Loading Balanced Raw Data: {file_path}")
    loaded = pd.read_csv(file_path)
    print(f"Data Loaded. Shape: {loaded.shape}")
    return loaded


# ==========================================
# 2. FEATURE ENGINEERING: SMOOTHING
# ==========================================
def apply_smoothing(df, indices=['ndvi', 'ndre', 'evi', 'savi', 'ci_re', 'ndwi']):
    """Savitzky-Golay smooth every vegetation-index time series.

    Zeros are treated as gap/fill markers: replaced with NaN, linearly
    interpolated along the time axis, then smoothed. Smoothed series are
    added next to the originals under a "_smooth" suffix.

    Returns (engineered_df, sorted list of YYYYMMDD date strings).
    """
    print("\n[FE] Applying Savitzky-Golay Smoothing (with 0-interpolation)...")
    stamp_re = re.compile(r"(\d{8})_")
    hits = (stamp_re.search(col) for col in df.columns)
    dates = sorted({hit.group(1) for hit in hits if hit})
    eng_df = df.copy()
    for idx in indices:
        cols = [f"{d}_{idx}" for d in dates if f"{d}_{idx}" in df.columns]
        if not cols:
            continue
        # Zeros indicate gaps -> NaN, then linear interpolation across time (axis=1).
        gapped = df[cols].replace(0, np.nan)
        filled = gapped.interpolate(method='linear', axis=1, limit_direction='both').fillna(0).values
        # window_length=5 (~50 days at a 10-day cadence), quadratic polynomial fit.
        smoothed = savgol_filter(filled, window_length=5, polyorder=2, axis=1)
        eng_df[[c + "_smooth" for c in cols]] = smoothed
    return eng_df, dates
# FEATURE ENGINEERING: PHENOLOGY
# ==========================================
def extract_phenology(df, dates, indices=['ndvi', 'ndre', 'evi']):
    """Derive per-pixel phenology metrics from each index time series.

    For every index adds magnitude stats (max/min/mean/std/amplitude),
    area under the seasonal curve, the peak time step, and the steepest
    rise/fall rates. Prefers "_smooth" columns, falling back to raw ones.

    FIX: the original fell back to raw column names without checking they
    exist in ``df`` (KeyError risk), and its ``if not cols`` guard could
    never trigger since the list always had len(dates) entries. The
    fallback now keeps only columns actually present, making the skip
    guard effective.
    """
    print("[FE] Extracting Phenology Metrics (Green-up, Peak, Senescence)...")
    eng_df = df.copy()
    for idx in indices:
        cols = [f"{d}_{idx}_smooth" for d in dates]
        if not all(c in df.columns for c in cols):
            # Fallback to raw columns, restricted to those actually present.
            cols = [f"{d}_{idx}" for d in dates if f"{d}_{idx}" in df.columns]
        if not cols:
            continue
        values = df[cols].values
        # 1. Magnitude metrics
        eng_df[f'{idx}_max'] = np.max(values, axis=1)
        eng_df[f'{idx}_min'] = np.min(values, axis=1)
        eng_df[f'{idx}_mean'] = np.mean(values, axis=1)
        eng_df[f'{idx}_std'] = np.std(values, axis=1)
        eng_df[f'{idx}_amplitude'] = eng_df[f'{idx}_max'] - eng_df[f'{idx}_min']
        # 2. Area under curve (integral); dx=10 for the 10-day compositing interval.
        eng_df[f'{idx}_auc'] = trapezoid(values, dx=10, axis=1)
        # 3. Timing: argmax gives the time-step index of the seasonal peak.
        eng_df[f'{idx}_peak_timestep'] = np.argmax(values, axis=1)
        # Rates (slopes): steepest green-up and steepest senescence.
        slopes = np.diff(values, axis=1)
        eng_df[f'{idx}_max_slope_up'] = np.max(slopes, axis=1)
        eng_df[f'{idx}_max_slope_down'] = np.min(slopes, axis=1)
    return eng_df
# FEATURE ENGINEERING: HARMONICS
# ==========================================
def add_harmonics(df, dates, indices=['ndvi']):
    """Add normalized first/second-order Fourier coefficients per index.

    Time steps are mapped onto one full cycle (0..2*pi); each series is
    projected onto the sin/cos bases and the dot products are divided by
    the number of dates. Prefers "_smooth" columns when available.
    """
    print("[FE] Fitting Harmonic/Fourier Features (Normalized)...")
    eng_df = df.copy()
    n_dates = len(dates)
    # One full seasonal cycle across all available time steps.
    t = 2 * np.pi * np.arange(n_dates) / n_dates
    bases = {
        'harmonic1_sin': np.sin(t),
        'harmonic1_cos': np.cos(t),
        'harmonic2_sin': np.sin(2 * t),
        'harmonic2_cos': np.cos(2 * t),
    }
    for idx in indices:
        # Smoothed series give cleaner harmonic fits when present.
        cols = [f"{d}_{idx}_smooth" for d in dates]
        if not all(c in df.columns for c in cols):
            cols = [f"{d}_{idx}" for d in dates]
        if not cols:
            continue
        values = df[cols].values
        for feat_name, basis in bases.items():
            eng_df[f'{idx}_{feat_name}'] = np.dot(values, basis) / n_dates
    return eng_df
# FEATURE ENGINEERING: INTERACTIONS & WINDOWS
# ==========================================
def add_interactions_and_windows(df, dates):
    """Add cross-index interaction features and seasonal-window summaries.

    Interactions are only computed when the required phenology columns
    already exist. Windows follow the Zimbabwe growing season (Oct-Jun),
    with dates parsed properly to handle the year crossover.
    """
    print("[FE] Computing Multi-Index Interactions & Seasonal Windows...")
    eng_df = df.copy()
    # Interactions
    if 'ndvi_max' in df.columns and 'ndre_max' in df.columns:
        eng_df['ndvi_ndre_peak_diff'] = df['ndvi_max'] - df['ndre_max']
    if 'evi_mean' in df.columns and 'ndvi_mean' in df.columns:
        # +0.001 guards against division by zero for bare-soil pixels.
        eng_df['canopy_density_contrast'] = df['evi_mean'] / (df['ndvi_mean'] + 0.001)
    # Windowed summaries (Zimbabwe season: Oct-Jun).
    dt_dates = pd.to_datetime(dates, format='%Y%m%d')
    windows = {
        'early': [10, 11, 12],  # Oct-Dec
        'peak': [1, 2, 3],      # Jan-Mar
        'late': [4, 5, 6],      # Apr-Jun
    }
    for idx in ['ndvi', 'ndwi', 'ndre']:
        # Prefer smoothed series when available.
        suffix = "_smooth" if f"{dates[0]}_{idx}_smooth" in df.columns else ""
        for win_name, months in windows.items():
            relevant_dates = [d_str for d_dt, d_str in zip(dt_dates, dates) if d_dt.month in months]
            relevant_cols = [f"{d}_{idx}{suffix}" for d in relevant_dates if f"{d}_{idx}{suffix}" in df.columns]
            if relevant_cols:
                eng_df[f'{idx}_{win_name}_mean'] = df[relevant_cols].mean(axis=1)
                eng_df[f'{idx}_{win_name}_max'] = df[relevant_cols].max(axis=1)
    return eng_df


# ==========================================
# 6. FEATURE SELECTION (Helper)
# ==========================================
def get_selected_feature_names(X_train, y_train):
    """Select features via a scout LightGBM trained on the TRAIN split only.

    Returns the names of all features with non-zero importance. Running
    this inside the train split (not the full data) avoids selection leakage.
    """
    print("\n" + "="*40)
    print("FEATURE SELECTION (Inside Train Split)")
    print("="*40)
    # 1. Train scout LightGBM
    print("Training scout LightGBM model...")
    lgbm = lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
    lgbm.fit(X_train, y_train)
    # 2. Rank features by importance
    importances = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': lgbm.feature_importances_
    }).sort_values('Importance', ascending=False)
    print("Top 10 Features:")
    print(importances.head(10))
    # 3. Keep only non-zero-importance features
    selected_feats = importances[importances['Importance'] > 0]['Feature'].tolist()
    print(f"Kept {len(selected_feats)} features (dropped {len(X_train.columns) - len(selected_feats)} zero-importance).")
    return selected_feats


# ==========================================
# 7. MODEL EVALUATION VISUALIZATIONS
# ==========================================
def plot_confusion_matrix_custom(y_true, y_pred, classes, model_name, folder_path, dataset_suffix=""):
    """Plot and save a confusion-matrix heatmap for one model run."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(14, 12))
    # Cell annotations become unreadable beyond ~25 classes.
    annot = len(classes) < 25
    sns.heatmap(cm, annot=annot, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes, cbar=False)
    plt.title(f'Confusion Matrix: {model_name} ({dataset_suffix})', fontsize=16)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(os.path.join(folder_path, f"ConfMat_{model_name}_{dataset_suffix}.png"))
    plt.show()
    plt.close()


def plot_feature_importance_custom(model, feature_names, model_name, folder_path, dataset_suffix="", top_n=20):
    """Plot and save the top-N feature importances; no-op if unsupported."""
    if not hasattr(model, 'feature_importances_'):
        return
    imp = model.feature_importances_
    df_imp = pd.DataFrame({'feature': feature_names, 'importance': imp})
    df_imp = df_imp.sort_values('importance', ascending=False).head(top_n)
    plt.figure(figsize=(10, 8))
    sns.barplot(x='importance', y='feature', data=df_imp, palette='viridis')
    plt.title(f'Top {top_n} Features: {model_name} ({dataset_suffix})', fontsize=16)
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.savefig(os.path.join(folder_path, f"FeatImp_{model_name}_{dataset_suffix}.png"))
    plt.show()
    plt.close()


def plot_per_class_f1(y_true, y_pred, classes, model_name, folder_path, dataset_suffix=""):
    """Plot per-class F1 scores and save the full classification report CSV."""
    # zero_division=0 suppresses UndefinedMetricWarning for empty classes.
    report = classification_report(y_true, y_pred, target_names=classes,
                                   output_dict=True, zero_division=0)
    df_report = pd.DataFrame(report).transpose()
    # Drop aggregate rows; keep only the per-class entries.
    df_report = df_report.drop(['accuracy', 'macro avg', 'weighted avg'], errors='ignore')
    plt.figure(figsize=(14, 6))
    sns.barplot(x=df_report.index, y=df_report['f1-score'], palette='coolwarm')
    plt.title(f'F1-Score per Class: {model_name} ({dataset_suffix})', fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.ylim(0, 1.05)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(folder_path, f"ClassF1_{model_name}_{dataset_suffix}.png"))
    plt.show()
    plt.close()
    # Save Report to CSV
    df_report.to_csv(os.path.join(folder_path, f"Report_{model_name}_{dataset_suffix}.csv"))


# ==========================================
# 8. TRAINING & COMPARISON LOOP
# ==========================================
def train_and_compare(df, folder_path, dataset_suffix="Raw"):
    """Train four classifiers plus a soft-voting ensemble and compare them.

    Splits BEFORE feature selection to avoid leakage, drops spatial/junk
    identifier columns, saves each trained model and its diagnostic plots,
    and returns a list of per-model metric dicts.

    FIX: `k` (top-k accuracy) is now defined before the predict_proba
    branch; the original set it only inside that branch, so a model
    without predict_proba would have raised NameError at the print below.
    """
    print("\n" + "="*40)
    print(f"MODEL TRAINING & COMPARISON ({dataset_suffix} Data)")
    print("="*40)
    # 1. Drop junk columns (spatial-leakage prevention), incl. 'is_syn' and 'system:index'.
    junk_cols = ['.geo', 'system:index', 'latitude', 'longitude', 'lat', 'lon',
                 'ID', 'parent_id', 'batch_id', 'is_syn']
    cols_to_drop = [c for c in junk_cols if c in df.columns]
    print(f"Dropping junk/spatial columns: {cols_to_drop}")
    df_clean = df.drop(columns=cols_to_drop)
    X = df_clean.drop(columns=['label'])
    y = df_clean['label']
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    class_names = le.classes_
    # 2. Split FIRST so feature selection never sees the test fold.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.2, random_state=42, stratify=y_enc)
    # 3. Feature selection on the train split only.
    selected_features = get_selected_feature_names(X_train, y_train)
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]
    print(f"Final Feature Count: {X_train.shape[1]}")
    # Robust model configurations.
    models = {
        'RandomForest': RandomForestClassifier(
            n_estimators=200, n_jobs=-1, random_state=42, class_weight='balanced'),
        'XGBoost': xgb.XGBClassifier(
            n_estimators=300, learning_rate=0.05, max_depth=7, subsample=0.8,
            colsample_bytree=0.8, eval_metric='mlogloss', n_jobs=-1, random_state=42),
        'LightGBM': lgb.LGBMClassifier(
            n_estimators=800, learning_rate=0.03, num_leaves=63, subsample=0.8,
            colsample_bytree=0.8, min_child_samples=30, class_weight='balanced',
            n_jobs=-1, random_state=42, verbose=-1),
        'CatBoost': CatBoostClassifier(
            iterations=500, learning_rate=0.05, depth=6, verbose=0,
            random_seed=42, auto_class_weights='Balanced'),
    }
    results = []
    trained_models = {}
    # k must be strictly less than the number of classes for top-k accuracy.
    k = 3 if len(class_names) > 3 else 1
    for name, model in models.items():
        print(f"\n--- Training {name} ({dataset_suffix}) ---")
        model.fit(X_train, y_train)
        if hasattr(model, "predict_proba"):
            probs = model.predict_proba(X_test)
            preds = np.argmax(probs, axis=1)
            top_k_acc = top_k_accuracy_score(y_test, probs, k=k)
        else:
            preds = model.predict(X_test)
            top_k_acc = 0.0
        acc = accuracy_score(y_test, preds)
        f1_macro = f1_score(y_test, preds, average='macro', zero_division=0)
        f1_weighted = f1_score(y_test, preds, average='weighted', zero_division=0)
        print(f"{name} -> Acc: {acc:.4f}, F1-Macro: {f1_macro:.4f}, "
              f"F1-Weighted: {f1_weighted:.4f}, Top-{k}: {top_k_acc:.4f}")
        results.append({
            'Model': name,
            'Dataset': dataset_suffix,
            'Accuracy': acc,
            'F1-Macro': f1_macro,
            'F1-Weighted': f1_weighted,
            f'Top-{k} Acc': top_k_acc,
        })
        trained_models[name] = model
        # Persist the fitted model.
        pkl_path = os.path.join(folder_path, f"Zimbabwe_{name}_{dataset_suffix}_Model.pkl")
        joblib.dump(model, pkl_path)
        # Diagnostics.
        plot_confusion_matrix_custom(y_test, preds, class_names, name, folder_path, dataset_suffix)
        plot_per_class_f1(y_test, preds, class_names, name, folder_path, dataset_suffix)
        plot_feature_importance_custom(model, selected_features, name, folder_path, dataset_suffix)
    # --- Voting Ensemble ---
    print(f"\n--- Training Voting Ensemble (Soft Vote) - {dataset_suffix} ---")
    ensemble = VotingClassifier(
        estimators=[(n, m) for n, m in trained_models.items()],
        voting='soft', n_jobs=-1)
    ensemble.fit(X_train, y_train)
    ens_probs = ensemble.predict_proba(X_test)
    ens_preds = np.argmax(ens_probs, axis=1)
    ens_acc = accuracy_score(y_test, ens_preds)
    ens_f1 = f1_score(y_test, ens_preds, average='macro', zero_division=0)
    ens_f1_w = f1_score(y_test, ens_preds, average='weighted', zero_division=0)
    ens_topk = top_k_accuracy_score(y_test, ens_probs, k=k)
    print(f"Ensemble -> Acc: {ens_acc:.4f}, F1-Macro: {ens_f1:.4f}, "
          f"F1-Weighted: {ens_f1_w:.4f}, Top-{k}: {ens_topk:.4f}")
    joblib.dump(ensemble, os.path.join(folder_path, f"Zimbabwe_Ensemble_{dataset_suffix}_Model.pkl"))
    plot_confusion_matrix_custom(y_test, ens_preds, class_names, "Ensemble", folder_path, dataset_suffix)
    plot_per_class_f1(y_test, ens_preds, class_names, "Ensemble", folder_path, dataset_suffix)
    results.append({
        'Model': 'Ensemble',
        'Dataset': dataset_suffix,
        'Accuracy': ens_acc,
        'F1-Macro': ens_f1,
        'F1-Weighted': ens_f1_w,
        f'Top-{k} Acc': ens_topk,
    })
    return results


# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    print("\n--- STARTING UPDATED CROP MAPPING SCRIPT ---")
    FOLDER_PATH = '/content/drive/MyDrive/GEE_Crop_Mapping_V2_Universal'
    # 1. Load Data
    df = load_balanced_data(FOLDER_PATH)
    if df is not None:
        # 2. Feature Engineering Pipeline
        df_smooth, dates = apply_smoothing(df)
        df_pheno = extract_phenology(df_smooth, dates)
        df_harm = add_harmonics(df_pheno, dates)
        df_full = add_interactions_and_windows(df_harm, dates)
        # Save Engineered Dataset (all columns, before split/selection)
        eng_path = os.path.join(FOLDER_PATH, "Zimbabwe_Crop_Engineered_Ready.csv")
        df_full.to_csv(eng_path, index=False)
        print(f"\nSaved Engineered Dataset (All Features): {eng_path}")
        # 3. Prepare two datasets: Raw vs Scaled
        print("\nPreparing Datasets: Raw vs StandardScaled...")
        df_raw = df_full.copy()
        df_scaled = df_full.copy()
        # FIX: scale numeric feature columns only — string identifier columns
        # (e.g. '.geo', 'system:index') would crash StandardScaler.
        features = df_scaled.drop(columns=['label']).select_dtypes(include=np.number).columns
        scaler = StandardScaler()
        df_scaled[features] = scaler.fit_transform(df_scaled[features])
        # 4. Train & compare both variants
        all_results = []
        res_raw = train_and_compare(df_raw, FOLDER_PATH, dataset_suffix="Raw")
        all_results.extend(res_raw)
        res_scaled = train_and_compare(df_scaled, FOLDER_PATH, dataset_suffix="Scaled")
        all_results.extend(res_scaled)
        # 5. Final combined comparison
        final_df = pd.DataFrame(all_results)
        print("\nFinal Comparative Results (Raw vs Scaled):")
        print(final_df[['Model', 'Dataset', 'Accuracy', 'F1-Macro']])
        plt.figure(figsize=(14, 7))
        sns.barplot(data=final_df, x='Model', y='F1-Macro', hue='Dataset', palette='Paired')
        plt.title("Model Comparison: Raw vs StandardScaled Features (F1-Macro)", fontsize=16)
        plt.ylim(0, 1.05)
        plt.grid(axis='y', alpha=0.3)
        plt.legend(title='Dataset Version', loc='lower right')
        plt.tight_layout()
        plt.savefig(os.path.join(FOLDER_PATH, "Final_Comparison_Raw_vs_Scaled.png"))
        plt.show()